{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6, "eval_steps": 2000, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4e-05, "grad_norm": 456.0, "learning_rate": 1.18e-05, "loss": 85.4554, "loss/crossentropy": 9.650346755981445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 8.066818237304688, "step": 2 }, { "epoch": 8e-05, "grad_norm": 416.0, "learning_rate": 1.3600000000000002e-05, "loss": 84.3418, "loss/crossentropy": 9.544375896453857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 7.628942489624023, "step": 4 }, { "epoch": 0.00012, "grad_norm": 466.0, "learning_rate": 1.54e-05, "loss": 87.2187, "loss/crossentropy": 9.569977283477783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 7.7909746170043945, "step": 6 }, { "epoch": 0.00016, "grad_norm": 247.0, "learning_rate": 1.72e-05, "loss": 82.5078, "loss/crossentropy": 9.06786823272705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 7.3673131465911865, "step": 8 }, { "epoch": 0.0002, "grad_norm": 179.0, "learning_rate": 1.9e-05, "loss": 78.2757, "loss/crossentropy": 8.918366432189941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.912693023681641, "step": 10 }, { "epoch": 0.00024, "grad_norm": 148.0, "learning_rate": 2.0800000000000004e-05, "loss": 74.4248, "loss/crossentropy": 8.443636417388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.567321538925171, "step": 12 }, { "epoch": 0.00028, "grad_norm": 131.0, "learning_rate": 2.2600000000000004e-05, "loss": 73.0003, "loss/crossentropy": 8.428278923034668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.706400156021118, "step": 14 }, { "epoch": 0.00032, "grad_norm": 181.0, "grad_norm_var": 16279.8625, "learning_rate": 2.4400000000000004e-05, "loss": 70.0047, "loss/crossentropy": 8.216889381408691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.056080102920532, "step": 16 }, { "epoch": 0.00036, "grad_norm": 90.5, "grad_norm_var": 14154.148958333333, "learning_rate": 2.6200000000000003e-05, "loss": 69.9766, "loss/crossentropy": 8.191599607467651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 6.429446697235107, "step": 18 }, { "epoch": 0.0004, "grad_norm": 52.25, "grad_norm_var": 12194.27890625, "learning_rate": 2.8000000000000003e-05, "loss": 64.3807, "loss/crossentropy": 7.506032228469849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 5.794633388519287, "step": 20 }, { "epoch": 0.00044, "grad_norm": 39.25, "grad_norm_var": 6249.4875, "learning_rate": 2.9800000000000006e-05, "loss": 61.2802, "loss/crossentropy": 7.152851343154907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 5.261489152908325, "step": 22 }, { "epoch": 0.00048, "grad_norm": 57.0, "grad_norm_var": 4626.8875, "learning_rate": 3.16e-05, "loss": 58.3454, "loss/crossentropy": 6.956738471984863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 5.020895004272461, "step": 24 }, { "epoch": 0.00052, "grad_norm": 86.0, "grad_norm_var": 4244.565625, "learning_rate": 3.3400000000000005e-05, "loss": 54.2703, "loss/crossentropy": 6.686542987823486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 4.801911354064941, "step": 26 }, { "epoch": 0.00056, "grad_norm": 110.5, "grad_norm_var": 3868.5875, "learning_rate": 3.520000000000001e-05, "loss": 51.7343, "loss/crossentropy": 6.4867262840271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 4.5746169090271, "step": 28 }, { "epoch": 0.0006, "grad_norm": 50.0, "grad_norm_var": 3953.82890625, "learning_rate": 3.7e-05, "loss": 49.6807, "loss/crossentropy": 6.364065408706665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 4.230688810348511, "step": 30 }, { "epoch": 0.00064, "grad_norm": 68.0, "grad_norm_var": 3157.4958333333334, "learning_rate": 3.88e-05, "loss": 44.7112, "loss/crossentropy": 5.731794834136963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 3.6969637870788574, "step": 32 }, { "epoch": 0.00068, "grad_norm": 50.75, "grad_norm_var": 500.59973958333336, "learning_rate": 4.0600000000000004e-05, "loss": 42.2923, "loss/crossentropy": 5.553718328475952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 3.7271878719329834, "step": 34 }, { "epoch": 0.00072, "grad_norm": 55.5, "grad_norm_var": 342.2122395833333, "learning_rate": 4.240000000000001e-05, "loss": 37.7465, "loss/crossentropy": 5.023651361465454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 3.2670111656188965, "step": 36 }, { "epoch": 0.00076, "grad_norm": 75.5, "grad_norm_var": 283.37395833333335, "learning_rate": 4.420000000000001e-05, "loss": 35.1313, "loss/crossentropy": 4.921839237213135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.950661063194275, "step": 38 }, { "epoch": 0.0008, "grad_norm": 44.5, "grad_norm_var": 299.1958333333333, "learning_rate": 4.600000000000001e-05, "loss": 32.3316, "loss/crossentropy": 4.782621145248413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.7473502159118652, "step": 40 }, { "epoch": 0.00084, "grad_norm": 36.5, "grad_norm_var": 361.1372395833333, "learning_rate": 4.78e-05, "loss": 28.4104, "loss/crossentropy": 3.8754972219467163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.410745143890381, "step": 42 }, { "epoch": 0.00088, "grad_norm": 36.25, "grad_norm_var": 225.80598958333334, "learning_rate": 4.96e-05, "loss": 26.1806, "loss/crossentropy": 3.9885865449905396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.1414425373077393, "step": 44 }, { "epoch": 0.00092, "grad_norm": 46.0, "grad_norm_var": 249.475, "learning_rate": 5.14e-05, "loss": 24.4012, "loss/crossentropy": 3.750515580177307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 2.063339352607727, "step": 46 }, { "epoch": 0.00096, "grad_norm": 20.875, "grad_norm_var": 315.8291015625, "learning_rate": 5.3200000000000006e-05, "loss": 22.822, "loss/crossentropy": 3.7912577390670776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.9196046590805054, "step": 48 }, { "epoch": 0.001, "grad_norm": 35.5, "grad_norm_var": 279.284375, "learning_rate": 5.500000000000001e-05, "loss": 21.036, "loss/crossentropy": 3.777758002281189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.7479944229125977, "step": 50 }, { "epoch": 0.00104, "grad_norm": 20.75, "grad_norm_var": 306.15149739583336, "learning_rate": 5.680000000000001e-05, "loss": 20.3608, "loss/crossentropy": 3.5903185606002808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.5533717274665833, "step": 52 }, { "epoch": 0.00108, "grad_norm": 43.75, "grad_norm_var": 213.07395833333334, "learning_rate": 5.860000000000001e-05, "loss": 18.813, "loss/crossentropy": 3.691780686378479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.4755533933639526, "step": 54 }, { "epoch": 0.00112, "grad_norm": 21.25, "grad_norm_var": 70.690625, "learning_rate": 6.040000000000001e-05, "loss": 19.1421, "loss/crossentropy": 3.557003617286682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.5198156833648682, "step": 56 }, { "epoch": 0.00116, "grad_norm": 21.5, "grad_norm_var": 76.30390625, "learning_rate": 6.220000000000001e-05, "loss": 17.2705, "loss/crossentropy": 3.2730292081832886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.4131997227668762, "step": 58 }, { "epoch": 0.0012, "grad_norm": 19.875, "grad_norm_var": 77.11399739583334, "learning_rate": 6.400000000000001e-05, "loss": 16.4712, "loss/crossentropy": 3.419156074523926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.3277101516723633, "step": 60 }, { "epoch": 0.00124, "grad_norm": 25.75, "grad_norm_var": 48.60520833333333, "learning_rate": 6.58e-05, "loss": 16.6219, "loss/crossentropy": 2.973878502845764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.3438007831573486, "step": 62 }, { "epoch": 0.00128, "grad_norm": 34.5, "grad_norm_var": 53.18020833333333, "learning_rate": 6.76e-05, "loss": 15.0929, "loss/crossentropy": 2.892021059989929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1624282002449036, "step": 64 }, { "epoch": 0.00132, "grad_norm": 15.4375, "grad_norm_var": 51.195947265625, "learning_rate": 6.94e-05, "loss": 15.1967, "loss/crossentropy": 2.954660177230835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.116301715373993, "step": 66 }, { "epoch": 0.00136, "grad_norm": 32.0, "grad_norm_var": 51.064306640625, "learning_rate": 7.120000000000001e-05, "loss": 14.9397, "loss/crossentropy": 3.2686156034469604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1978037357330322, "step": 68 }, { "epoch": 0.0014, "grad_norm": 29.25, "grad_norm_var": 32.794270833333336, "learning_rate": 7.3e-05, "loss": 14.4846, "loss/crossentropy": 2.7956581115722656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1973016262054443, "step": 70 }, { "epoch": 0.00144, "grad_norm": 15.25, "grad_norm_var": 37.80149739583333, "learning_rate": 7.48e-05, "loss": 14.1296, "loss/crossentropy": 3.08966863155365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.151496708393097, "step": 72 }, { "epoch": 0.00148, "grad_norm": 18.625, "grad_norm_var": 41.88984375, "learning_rate": 7.66e-05, "loss": 13.6812, "loss/crossentropy": 2.949987292289734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9787414371967316, "step": 74 }, { "epoch": 0.00152, "grad_norm": 15.1875, "grad_norm_var": 49.143489583333334, "learning_rate": 7.840000000000001e-05, "loss": 12.8901, "loss/crossentropy": 3.1161292791366577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.0640113949775696, "step": 76 }, { "epoch": 0.00156, "grad_norm": 22.125, "grad_norm_var": 47.63170572916667, "learning_rate": 8.020000000000001e-05, "loss": 13.157, "loss/crossentropy": 3.3661664724349976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 1.1353825330734253, "step": 78 }, { "epoch": 0.0016, "grad_norm": 16.25, "grad_norm_var": 35.33274739583333, "learning_rate": 8.200000000000001e-05, "loss": 12.9372, "loss/crossentropy": 2.927241563796997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9984134435653687, "step": 80 }, { "epoch": 0.00164, "grad_norm": 13.625, "grad_norm_var": 37.53951822916667, "learning_rate": 8.38e-05, "loss": 12.0477, "loss/crossentropy": 3.1273285150527954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9241160154342651, "step": 82 }, { "epoch": 0.00168, "grad_norm": 19.125, "grad_norm_var": 19.602718098958334, "learning_rate": 8.560000000000001e-05, "loss": 11.9084, "loss/crossentropy": 2.737278938293457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8822851181030273, "step": 84 }, { "epoch": 0.00172, "grad_norm": 13.0625, "grad_norm_var": 11.107014973958334, "learning_rate": 8.740000000000001e-05, "loss": 11.8594, "loss/crossentropy": 2.4452388286590576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8693483769893646, "step": 86 }, { "epoch": 0.00176, "grad_norm": 18.375, "grad_norm_var": 11.328369140625, "learning_rate": 8.92e-05, "loss": 11.5058, "loss/crossentropy": 2.89771831035614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8749278783798218, "step": 88 }, { "epoch": 0.0018, "grad_norm": 12.1875, "grad_norm_var": 13.811572265625, "learning_rate": 9.1e-05, "loss": 11.9281, "loss/crossentropy": 3.0173208713531494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8714744746685028, "step": 90 }, { "epoch": 0.00184, "grad_norm": 20.375, "grad_norm_var": 16.114306640625, "learning_rate": 9.28e-05, "loss": 11.4244, "loss/crossentropy": 2.588515043258667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9651442170143127, "step": 92 }, { "epoch": 0.00188, "grad_norm": 13.375, "grad_norm_var": 12.784309895833333, "learning_rate": 9.46e-05, "loss": 11.6092, "loss/crossentropy": 2.8116774559020996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.9143906235694885, "step": 94 }, { "epoch": 0.00192, "grad_norm": 16.375, "grad_norm_var": 13.225764973958333, "learning_rate": 9.64e-05, "loss": 10.6723, "loss/crossentropy": 2.8734441995620728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.814399927854538, "step": 96 }, { "epoch": 0.00196, "grad_norm": 10.3125, "grad_norm_var": 17.228238932291667, "learning_rate": 9.82e-05, "loss": 10.6577, "loss/crossentropy": 2.664194703102112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7905783653259277, "step": 98 }, { "epoch": 0.002, "grad_norm": 12.3125, "grad_norm_var": 17.113785807291666, "learning_rate": 0.0001, "loss": 10.6847, "loss/crossentropy": 2.4851003885269165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.769305944442749, "step": 100 }, { "epoch": 0.00204, "grad_norm": 10.75, "grad_norm_var": 13.9140625, "learning_rate": 0.0001, "loss": 10.8119, "loss/crossentropy": 2.2757182121276855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8241511881351471, "step": 102 }, { "epoch": 0.00208, "grad_norm": 11.375, "grad_norm_var": 13.615738932291666, "learning_rate": 0.0001, "loss": 10.6244, "loss/crossentropy": 2.7211785316467285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.8518709540367126, "step": 104 }, { "epoch": 0.00212, "grad_norm": 11.625, "grad_norm_var": 14.163395182291667, "learning_rate": 0.0001, "loss": 10.5629, "loss/crossentropy": 2.387019991874695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7568954229354858, "step": 106 }, { "epoch": 0.00216, "grad_norm": 9.9375, "grad_norm_var": 10.788525390625, "learning_rate": 0.0001, "loss": 10.1364, "loss/crossentropy": 2.5363346338272095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7648341059684753, "step": 108 }, { "epoch": 0.0022, "grad_norm": 20.0, "grad_norm_var": 14.898893229166667, "learning_rate": 0.0001, "loss": 10.8773, "loss/crossentropy": 2.8450236320495605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.737193763256073, "step": 110 }, { "epoch": 0.00224, "grad_norm": 10.25, "grad_norm_var": 15.364518229166666, "learning_rate": 0.0001, "loss": 9.4554, "loss/crossentropy": 2.4827451705932617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6924614012241364, "step": 112 }, { "epoch": 0.00228, "grad_norm": 10.5625, "grad_norm_var": 7.461832682291667, "learning_rate": 0.0001, "loss": 9.9651, "loss/crossentropy": 2.093318462371826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7038464546203613, "step": 114 }, { "epoch": 0.00232, "grad_norm": 9.6875, "grad_norm_var": 7.597330729166667, "learning_rate": 0.0001, "loss": 10.0297, "loss/crossentropy": 2.5149790048599243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6980189085006714, "step": 116 }, { "epoch": 0.00236, "grad_norm": 11.625, "grad_norm_var": 6.672509765625, "learning_rate": 0.0001, "loss": 9.8176, "loss/crossentropy": 2.6022276878356934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7005251348018646, "step": 118 }, { "epoch": 0.0024, "grad_norm": 7.625, "grad_norm_var": 7.298160807291667, "learning_rate": 0.0001, "loss": 9.5658, "loss/crossentropy": 2.6836462020874023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7237774729728699, "step": 120 }, { "epoch": 0.00244, "grad_norm": 9.5625, "grad_norm_var": 7.402018229166667, "learning_rate": 0.0001, "loss": 9.7376, "loss/crossentropy": 2.6823805570602417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7570162117481232, "step": 122 }, { "epoch": 0.00248, "grad_norm": 11.25, "grad_norm_var": 7.391259765625, "learning_rate": 0.0001, "loss": 9.4713, "loss/crossentropy": 2.6233514547348022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7183247208595276, "step": 124 }, { "epoch": 0.00252, "grad_norm": 9.9375, "grad_norm_var": 1.0839680989583333, "learning_rate": 0.0001, "loss": 9.2243, "loss/crossentropy": 2.331676959991455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6982125043869019, "step": 126 }, { "epoch": 0.00256, "grad_norm": 9.1875, "grad_norm_var": 0.9687337239583333, "learning_rate": 0.0001, "loss": 9.4777, "loss/crossentropy": 2.429046392440796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.696417510509491, "step": 128 }, { "epoch": 0.0026, "grad_norm": 14.8125, "grad_norm_var": 2.448291015625, "learning_rate": 0.0001, "loss": 9.9024, "loss/crossentropy": 2.5262571573257446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.716008871793747, "step": 130 }, { "epoch": 0.00264, "grad_norm": 8.8125, "grad_norm_var": 2.7044270833333335, "learning_rate": 0.0001, "loss": 9.2836, "loss/crossentropy": 2.187526524066925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.666053056716919, "step": 132 }, { "epoch": 0.00268, "grad_norm": 8.9375, "grad_norm_var": 3.285270182291667, "learning_rate": 0.0001, "loss": 9.8338, "loss/crossentropy": 2.4199057817459106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6709816455841064, "step": 134 }, { "epoch": 0.00272, "grad_norm": 9.6875, "grad_norm_var": 3.4072916666666666, "learning_rate": 0.0001, "loss": 9.4225, "loss/crossentropy": 2.1963008642196655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5963725447654724, "step": 136 }, { "epoch": 0.00276, "grad_norm": 8.0625, "grad_norm_var": 3.556884765625, "learning_rate": 0.0001, "loss": 9.44, "loss/crossentropy": 2.5878132581710815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6362220048904419, "step": 138 }, { "epoch": 0.0028, "grad_norm": 9.25, "grad_norm_var": 3.4852701822916665, "learning_rate": 0.0001, "loss": 9.4314, "loss/crossentropy": 2.7800480127334595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6819457113742828, "step": 140 }, { "epoch": 0.00284, "grad_norm": 8.4375, "grad_norm_var": 3.838997395833333, "learning_rate": 0.0001, "loss": 9.1047, "loss/crossentropy": 2.5055110454559326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.674308180809021, "step": 142 }, { "epoch": 0.00288, "grad_norm": 8.25, "grad_norm_var": 4.078499348958333, "learning_rate": 0.0001, "loss": 9.1578, "loss/crossentropy": 2.8532944917678833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.7083481848239899, "step": 144 }, { "epoch": 0.00292, "grad_norm": 7.8125, "grad_norm_var": 2.2759765625, "learning_rate": 0.0001, "loss": 8.8023, "loss/crossentropy": 2.442527174949646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6510869562625885, "step": 146 }, { "epoch": 0.00296, "grad_norm": 10.9375, "grad_norm_var": 2.8544881184895834, "learning_rate": 0.0001, "loss": 8.7238, "loss/crossentropy": 2.516597867012024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6362285614013672, "step": 148 }, { "epoch": 0.003, "grad_norm": 7.40625, "grad_norm_var": 1.8615885416666667, "learning_rate": 0.0001, "loss": 8.5543, "loss/crossentropy": 2.8672900199890137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6459421515464783, "step": 150 }, { "epoch": 0.00304, "grad_norm": 8.1875, "grad_norm_var": 1.7195963541666666, "learning_rate": 0.0001, "loss": 8.6403, "loss/crossentropy": 2.2042795419692993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5720505118370056, "step": 152 }, { "epoch": 0.00308, "grad_norm": 7.8125, "grad_norm_var": 1.876025390625, "learning_rate": 0.0001, "loss": 8.768, "loss/crossentropy": 2.225563883781433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6328675150871277, "step": 154 }, { "epoch": 0.00312, "grad_norm": 8.0625, "grad_norm_var": 1.5925618489583333, "learning_rate": 0.0001, "loss": 8.5743, "loss/crossentropy": 2.3541462421417236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5927431881427765, "step": 156 }, { "epoch": 0.00316, "grad_norm": 7.34375, "grad_norm_var": 1.834619140625, "learning_rate": 0.0001, "loss": 8.7329, "loss/crossentropy": 2.4685616493225098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6299368739128113, "step": 158 }, { "epoch": 0.0032, "grad_norm": 9.25, "grad_norm_var": 1.4429646809895833, "learning_rate": 0.0001, "loss": 8.4796, "loss/crossentropy": 2.4637919664382935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6084515154361725, "step": 160 }, { "epoch": 0.00324, "grad_norm": 9.0625, "grad_norm_var": 1.3692545572916666, "learning_rate": 0.0001, "loss": 9.0446, "loss/crossentropy": 2.598397374153137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.574739396572113, "step": 162 }, { "epoch": 0.00328, "grad_norm": 7.15625, "grad_norm_var": 0.9171834309895833, "learning_rate": 0.0001, "loss": 8.1508, "loss/crossentropy": 2.5183030366897583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5985860526561737, "step": 164 }, { "epoch": 0.00332, "grad_norm": 9.125, "grad_norm_var": 0.9571614583333333, "learning_rate": 0.0001, "loss": 8.4296, "loss/crossentropy": 2.252183437347412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5627379417419434, "step": 166 }, { "epoch": 0.00336, "grad_norm": 7.875, "grad_norm_var": 0.70533447265625, "learning_rate": 0.0001, "loss": 8.4549, "loss/crossentropy": 2.5720516443252563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5945309698581696, "step": 168 }, { "epoch": 0.0034, "grad_norm": 9.1875, "grad_norm_var": 0.8844034830729167, "learning_rate": 0.0001, "loss": 8.6096, "loss/crossentropy": 2.3004332184791565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5401738286018372, "step": 170 }, { "epoch": 0.00344, "grad_norm": 7.71875, "grad_norm_var": 0.948681640625, "learning_rate": 0.0001, "loss": 8.5484, "loss/crossentropy": 2.689734935760498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6384358406066895, "step": 172 }, { "epoch": 0.00348, "grad_norm": 6.84375, "grad_norm_var": 0.9418253580729167, "learning_rate": 0.0001, "loss": 8.1888, "loss/crossentropy": 1.944397747516632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5828913450241089, "step": 174 }, { "epoch": 0.00352, "grad_norm": 6.53125, "grad_norm_var": 0.8817342122395834, "learning_rate": 0.0001, "loss": 8.0577, "loss/crossentropy": 2.8166507482528687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5916908979415894, "step": 176 }, { "epoch": 0.00356, "grad_norm": 7.09375, "grad_norm_var": 0.7456990559895833, "learning_rate": 0.0001, "loss": 8.7701, "loss/crossentropy": 2.3208402395248413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5968939661979675, "step": 178 }, { "epoch": 0.0036, "grad_norm": 7.25, "grad_norm_var": 0.7425130208333334, "learning_rate": 0.0001, "loss": 8.2615, "loss/crossentropy": 2.817763566970825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5800873041152954, "step": 180 }, { "epoch": 0.00364, "grad_norm": 6.78125, "grad_norm_var": 0.6126912434895834, "learning_rate": 0.0001, "loss": 8.3053, "loss/crossentropy": 2.250023365020752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5279016494750977, "step": 182 }, { "epoch": 0.00368, "grad_norm": 6.09375, "grad_norm_var": 0.6917805989583333, "learning_rate": 0.0001, "loss": 7.7974, "loss/crossentropy": 2.1337096095085144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5393811166286469, "step": 184 }, { "epoch": 0.00372, "grad_norm": 6.34375, "grad_norm_var": 0.5962198893229167, "learning_rate": 0.0001, "loss": 7.7258, "loss/crossentropy": 2.6338934898376465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6444519460201263, "step": 186 }, { "epoch": 0.00376, "grad_norm": 10.4375, "grad_norm_var": 1.2786458333333333, "learning_rate": 0.0001, "loss": 8.0762, "loss/crossentropy": 2.66677463054657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6012931764125824, "step": 188 }, { "epoch": 0.0038, "grad_norm": 6.375, "grad_norm_var": 1.6642537434895834, "learning_rate": 0.0001, "loss": 8.3177, "loss/crossentropy": 2.3731196522712708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.6299596428871155, "step": 190 }, { "epoch": 0.00384, "grad_norm": 7.4375, "grad_norm_var": 1.7719889322916667, "learning_rate": 0.0001, "loss": 8.214, "loss/crossentropy": 2.411492705345154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5328834354877472, "step": 192 }, { "epoch": 0.00388, "grad_norm": 7.15625, "grad_norm_var": 1.9489420572916667, "learning_rate": 0.0001, "loss": 7.9763, "loss/crossentropy": 2.2402734756469727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.539243072271347, "step": 194 }, { "epoch": 0.00392, "grad_norm": 7.53125, "grad_norm_var": 1.895556640625, "learning_rate": 0.0001, "loss": 7.9292, "loss/crossentropy": 2.3250681161880493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5374342203140259, "step": 196 }, { "epoch": 0.00396, "grad_norm": 7.75, "grad_norm_var": 1.8979451497395834, "learning_rate": 0.0001, "loss": 7.9201, "loss/crossentropy": 2.42138135433197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5410921573638916, "step": 198 }, { "epoch": 0.004, "grad_norm": 6.78125, "grad_norm_var": 1.8954386393229166, "learning_rate": 0.0001, "loss": 7.7597, "loss/crossentropy": 2.1954251527786255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.49367184937000275, "step": 200 }, { "epoch": 0.00404, "grad_norm": 6.75, "grad_norm_var": 1.6848307291666667, "learning_rate": 0.0001, "loss": 7.9033, "loss/crossentropy": 2.81479811668396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5567455887794495, "step": 202 }, { "epoch": 0.00408, "grad_norm": 6.5, "grad_norm_var": 1.08228759765625, "learning_rate": 0.0001, "loss": 7.9812, "loss/crossentropy": 2.611761450767517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.544409453868866, "step": 204 }, { "epoch": 0.00412, "grad_norm": 6.125, "grad_norm_var": 0.650244140625, "learning_rate": 0.0001, "loss": 7.7921, "loss/crossentropy": 2.1369245052337646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5181446373462677, "step": 206 }, { "epoch": 0.00416, "grad_norm": 7.1875, "grad_norm_var": 0.30745035807291665, "learning_rate": 0.0001, "loss": 8.3375, "loss/crossentropy": 2.435856580734253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5506252646446228, "step": 208 }, { "epoch": 0.0042, "grad_norm": 6.21875, "grad_norm_var": 0.26848958333333334, "learning_rate": 0.0001, "loss": 7.7599, "loss/crossentropy": 2.2404768466949463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46975645422935486, "step": 210 }, { "epoch": 0.00424, "grad_norm": 6.71875, "grad_norm_var": 0.19713541666666667, "learning_rate": 0.0001, "loss": 7.7083, "loss/crossentropy": 2.4866777658462524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5217231214046478, "step": 212 }, { "epoch": 0.00428, "grad_norm": 6.15625, "grad_norm_var": 0.121484375, "learning_rate": 0.0001, "loss": 7.6519, "loss/crossentropy": 2.074867010116577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4683506190776825, "step": 214 }, { "epoch": 0.00432, "grad_norm": 7.59375, "grad_norm_var": 0.21679280598958334, "learning_rate": 0.0001, "loss": 7.6062, "loss/crossentropy": 2.2040151357650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5235520601272583, "step": 216 }, { "epoch": 0.00436, "grad_norm": 7.25, "grad_norm_var": 0.25690104166666666, "learning_rate": 0.0001, "loss": 7.886, "loss/crossentropy": 2.174479365348816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5243187248706818, "step": 218 }, { "epoch": 0.0044, "grad_norm": 6.28125, "grad_norm_var": 0.26523030598958336, "learning_rate": 0.0001, "loss": 7.7535, "loss/crossentropy": 2.5678584575653076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5818615555763245, "step": 220 }, { "epoch": 0.00444, "grad_norm": 6.0625, "grad_norm_var": 0.2814412434895833, "learning_rate": 0.0001, "loss": 7.859, "loss/crossentropy": 2.4551891088485718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5400412082672119, "step": 222 }, { "epoch": 0.00448, "grad_norm": 6.15625, "grad_norm_var": 0.25310872395833334, "learning_rate": 0.0001, "loss": 7.7341, "loss/crossentropy": 2.0638335943222046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.49279752373695374, "step": 224 }, { "epoch": 0.00452, "grad_norm": 6.6875, "grad_norm_var": 0.38235677083333336, "learning_rate": 0.0001, "loss": 8.1064, "loss/crossentropy": 2.553247332572937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5566797852516174, "step": 226 }, { "epoch": 0.00456, "grad_norm": 6.0625, "grad_norm_var": 0.39451497395833335, "learning_rate": 0.0001, "loss": 7.7812, "loss/crossentropy": 2.544332265853882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5118530094623566, "step": 228 }, { "epoch": 0.0046, "grad_norm": 5.875, "grad_norm_var": 0.4054036458333333, "learning_rate": 0.0001, "loss": 7.064, "loss/crossentropy": 2.191234052181244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5157675743103027, "step": 230 }, { "epoch": 0.00464, "grad_norm": 5.4375, "grad_norm_var": 0.360791015625, "learning_rate": 0.0001, "loss": 7.6611, "loss/crossentropy": 2.3671151399612427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5233491659164429, "step": 232 }, { "epoch": 0.00468, "grad_norm": 7.125, "grad_norm_var": 0.4180826822916667, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.3003920316696167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5037694871425629, "step": 234 }, { "epoch": 0.00472, "grad_norm": 5.625, "grad_norm_var": 0.4305338541666667, "learning_rate": 0.0001, "loss": 7.8236, "loss/crossentropy": 2.4672670364379883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5187652707099915, "step": 236 }, { "epoch": 0.00476, "grad_norm": 6.0625, "grad_norm_var": 0.4493326822916667, "learning_rate": 0.0001, "loss": 7.3246, "loss/crossentropy": 2.179289937019348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5111505687236786, "step": 238 }, { "epoch": 0.0048, "grad_norm": 6.34375, "grad_norm_var": 0.5123697916666666, "learning_rate": 0.0001, "loss": 7.6064, "loss/crossentropy": 2.2424627542495728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5187103897333145, "step": 240 }, { "epoch": 0.00484, "grad_norm": 5.96875, "grad_norm_var": 0.2986979166666667, "learning_rate": 0.0001, "loss": 7.8108, "loss/crossentropy": 2.8024520874023438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5700482130050659, "step": 242 }, { "epoch": 0.00488, "grad_norm": 6.25, "grad_norm_var": 0.4554524739583333, "learning_rate": 0.0001, "loss": 7.6644, "loss/crossentropy": 2.3653491735458374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5232449471950531, "step": 244 }, { "epoch": 0.00492, "grad_norm": 6.90625, "grad_norm_var": 0.48513997395833336, "learning_rate": 0.0001, "loss": 7.5028, "loss/crossentropy": 2.6201778650283813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5684538185596466, "step": 246 }, { "epoch": 0.00496, "grad_norm": 7.3125, "grad_norm_var": 0.49654541015625, "learning_rate": 0.0001, "loss": 7.6631, "loss/crossentropy": 2.2811471819877625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5543638169765472, "step": 248 }, { "epoch": 0.005, "grad_norm": 5.625, "grad_norm_var": 0.48544514973958336, "learning_rate": 0.0001, "loss": 7.6914, "loss/crossentropy": 2.4381459951400757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5594777166843414, "step": 250 }, { "epoch": 0.00504, "grad_norm": 9.375, "grad_norm_var": 1.04000244140625, "learning_rate": 0.0001, "loss": 7.7346, "loss/crossentropy": 2.435782313346863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5159732699394226, "step": 252 }, { "epoch": 0.00508, "grad_norm": 5.46875, "grad_norm_var": 1.0892862955729166, "learning_rate": 0.0001, "loss": 8.0282, "loss/crossentropy": 2.7867215871810913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.551640123128891, "step": 254 }, { "epoch": 0.00512, "grad_norm": 5.46875, "grad_norm_var": 1.2432902018229166, "learning_rate": 0.0001, "loss": 7.0744, "loss/crossentropy": 1.9328945875167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45153285562992096, "step": 256 }, { "epoch": 0.00516, "grad_norm": 6.21875, "grad_norm_var": 1.2460286458333334, "learning_rate": 0.0001, "loss": 7.292, "loss/crossentropy": 2.552613139152527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5215992629528046, "step": 258 }, { "epoch": 0.0052, "grad_norm": 5.40625, "grad_norm_var": 1.1848958333333333, "learning_rate": 0.0001, "loss": 7.4992, "loss/crossentropy": 2.3720492124557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5024219453334808, "step": 260 }, { "epoch": 0.00524, "grad_norm": 8.1875, "grad_norm_var": 1.4898274739583333, "learning_rate": 0.0001, "loss": 7.4676, "loss/crossentropy": 2.465815782546997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5383751839399338, "step": 262 }, { "epoch": 0.00528, "grad_norm": 6.0625, "grad_norm_var": 1.5113240559895833, "learning_rate": 0.0001, "loss": 7.3163, "loss/crossentropy": 2.2791935205459595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.534159854054451, "step": 264 }, { "epoch": 0.00532, "grad_norm": 6.28125, "grad_norm_var": 1.3855305989583333, "learning_rate": 0.0001, "loss": 7.9279, "loss/crossentropy": 2.48906409740448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5344790518283844, "step": 266 }, { "epoch": 0.00536, "grad_norm": 4.90625, "grad_norm_var": 1.0180826822916667, "learning_rate": 0.0001, "loss": 7.3178, "loss/crossentropy": 2.0858306884765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.47646892070770264, "step": 268 }, { "epoch": 0.0054, "grad_norm": 8.375, "grad_norm_var": 1.18258056640625, "learning_rate": 0.0001, "loss": 7.383, "loss/crossentropy": 2.159322738647461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5543113648891449, "step": 270 }, { "epoch": 0.00544, "grad_norm": 4.9375, "grad_norm_var": 1.204931640625, "learning_rate": 0.0001, "loss": 7.1635, "loss/crossentropy": 2.249913454055786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4896702766418457, "step": 272 }, { "epoch": 0.00548, "grad_norm": 8.125, "grad_norm_var": 1.388916015625, "learning_rate": 0.0001, "loss": 7.3565, "loss/crossentropy": 1.998712420463562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4770403504371643, "step": 274 }, { "epoch": 0.00552, "grad_norm": 5.53125, "grad_norm_var": 1.9266764322916667, "learning_rate": 0.0001, "loss": 7.6522, "loss/crossentropy": 2.391260862350464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5259605348110199, "step": 276 }, { "epoch": 0.00556, "grad_norm": 6.1875, "grad_norm_var": 1.6884765625, "learning_rate": 0.0001, "loss": 7.2935, "loss/crossentropy": 2.523361325263977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4724537879228592, "step": 278 }, { "epoch": 0.0056, "grad_norm": 5.34375, "grad_norm_var": 1.7386555989583334, "learning_rate": 0.0001, "loss": 7.3505, "loss/crossentropy": 2.281963586807251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.49724647402763367, "step": 280 }, { "epoch": 0.00564, "grad_norm": 5.0625, "grad_norm_var": 1.8446451822916667, "learning_rate": 0.0001, "loss": 7.1079, "loss/crossentropy": 2.2403814792633057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45384156703948975, "step": 282 }, { "epoch": 0.00568, "grad_norm": 5.6875, "grad_norm_var": 1.670166015625, "learning_rate": 0.0001, "loss": 7.4318, "loss/crossentropy": 2.2687963247299194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46648281812667847, "step": 284 }, { "epoch": 0.00572, "grad_norm": 5.75, "grad_norm_var": 1.38541259765625, "learning_rate": 0.0001, "loss": 7.3035, "loss/crossentropy": 2.3336217403411865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4792183041572571, "step": 286 }, { "epoch": 0.00576, "grad_norm": 6.4375, "grad_norm_var": 1.292431640625, "learning_rate": 0.0001, "loss": 7.0031, "loss/crossentropy": 2.4006571769714355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.473391056060791, "step": 288 }, { "epoch": 0.0058, "grad_norm": 5.09375, "grad_norm_var": 1.18671875, "learning_rate": 0.0001, "loss": 6.8037, "loss/crossentropy": 2.0306124687194824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4722501188516617, "step": 290 }, { "epoch": 0.00584, "grad_norm": 5.34375, "grad_norm_var": 0.38865559895833335, "learning_rate": 0.0001, "loss": 7.1378, "loss/crossentropy": 2.412277102470398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48340730369091034, "step": 292 }, { "epoch": 0.00588, "grad_norm": 6.21875, "grad_norm_var": 0.3952433268229167, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.3195769786834717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5074818134307861, "step": 294 }, { "epoch": 0.00592, "grad_norm": 6.71875, "grad_norm_var": 0.2867024739583333, "learning_rate": 0.0001, "loss": 7.2826, "loss/crossentropy": 2.3265275955200195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5351093411445618, "step": 296 }, { "epoch": 0.00596, "grad_norm": 5.625, "grad_norm_var": 0.26171468098958334, "learning_rate": 0.0001, "loss": 7.0106, "loss/crossentropy": 2.210574746131897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45469868183135986, "step": 298 }, { "epoch": 0.006, "grad_norm": 6.3125, "grad_norm_var": 0.3034138997395833, "learning_rate": 0.0001, "loss": 7.4741, "loss/crossentropy": 2.347964644432068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45269395411014557, "step": 300 }, { "epoch": 0.00604, "grad_norm": 5.0, "grad_norm_var": 0.34308268229166666, "learning_rate": 0.0001, "loss": 6.8901, "loss/crossentropy": 2.197494626045227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45010584592819214, "step": 302 }, { "epoch": 0.00608, "grad_norm": 5.8125, "grad_norm_var": 0.32224934895833335, "learning_rate": 0.0001, "loss": 6.909, "loss/crossentropy": 2.3295196890830994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48989084362983704, "step": 304 }, { "epoch": 0.00612, "grad_norm": 6.3125, "grad_norm_var": 0.26347249348958335, "learning_rate": 0.0001, "loss": 7.6606, "loss/crossentropy": 2.5208678245544434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46991507709026337, "step": 306 }, { "epoch": 0.00616, "grad_norm": 5.40625, "grad_norm_var": 0.22849934895833332, "learning_rate": 0.0001, "loss": 7.2881, "loss/crossentropy": 2.6091307401657104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5090319812297821, "step": 308 }, { "epoch": 0.0062, "grad_norm": 5.4375, "grad_norm_var": 0.3083984375, "learning_rate": 0.0001, "loss": 7.0888, "loss/crossentropy": 2.4142966270446777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45959460735321045, "step": 310 }, { "epoch": 0.00624, "grad_norm": 6.0625, "grad_norm_var": 0.25006103515625, "learning_rate": 0.0001, "loss": 7.3054, "loss/crossentropy": 2.3062673807144165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4381408095359802, "step": 312 }, { "epoch": 0.00628, "grad_norm": 4.875, "grad_norm_var": 0.29498697916666666, "learning_rate": 0.0001, "loss": 6.5202, "loss/crossentropy": 2.1124885082244873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4392775595188141, "step": 314 }, { "epoch": 0.00632, "grad_norm": 5.09375, "grad_norm_var": 0.3001302083333333, "learning_rate": 0.0001, "loss": 6.3297, "loss/crossentropy": 2.0250568985939026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4133095294237137, "step": 316 }, { "epoch": 0.00636, "grad_norm": 5.625, "grad_norm_var": 0.31021728515625, "learning_rate": 0.0001, "loss": 6.9903, "loss/crossentropy": 2.4011316299438477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46330246329307556, "step": 318 }, { "epoch": 0.0064, "grad_norm": 5.65625, "grad_norm_var": 0.30305582682291665, "learning_rate": 0.0001, "loss": 7.2114, "loss/crossentropy": 2.487559676170349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.47237157821655273, "step": 320 }, { "epoch": 0.00644, "grad_norm": 5.1875, "grad_norm_var": 0.2775349934895833, "learning_rate": 0.0001, "loss": 6.5935, "loss/crossentropy": 1.999566912651062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4165455400943756, "step": 322 }, { "epoch": 0.00648, "grad_norm": 6.03125, "grad_norm_var": 0.27496337890625, "learning_rate": 0.0001, "loss": 7.0573, "loss/crossentropy": 2.545841693878174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4725654572248459, "step": 324 }, { "epoch": 0.00652, "grad_norm": 5.0625, "grad_norm_var": 0.2528483072916667, "learning_rate": 0.0001, "loss": 7.2351, "loss/crossentropy": 2.119086444377899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4649319499731064, "step": 326 }, { "epoch": 0.00656, "grad_norm": 5.21875, "grad_norm_var": 0.221728515625, "learning_rate": 0.0001, "loss": 6.8367, "loss/crossentropy": 2.365525245666504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5121739506721497, "step": 328 }, { "epoch": 0.0066, "grad_norm": 5.25, "grad_norm_var": 0.17467041015625, "learning_rate": 0.0001, "loss": 6.8384, "loss/crossentropy": 2.2604740858078003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4704676419496536, "step": 330 }, { "epoch": 0.00664, "grad_norm": 6.15625, "grad_norm_var": 0.18977864583333334, "learning_rate": 0.0001, "loss": 7.5125, "loss/crossentropy": 2.4891955852508545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5196183770895004, "step": 332 }, { "epoch": 0.00668, "grad_norm": 5.46875, "grad_norm_var": 0.1767578125, "learning_rate": 0.0001, "loss": 7.3139, "loss/crossentropy": 2.430082321166992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.47671228647232056, "step": 334 }, { "epoch": 0.00672, "grad_norm": 5.53125, "grad_norm_var": 0.187353515625, "learning_rate": 0.0001, "loss": 6.6969, "loss/crossentropy": 2.2450510263442993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5124029517173767, "step": 336 }, { "epoch": 0.00676, "grad_norm": 5.375, "grad_norm_var": 0.18251546223958334, "learning_rate": 0.0001, "loss": 6.8537, "loss/crossentropy": 2.225212812423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4358299970626831, "step": 338 }, { "epoch": 0.0068, "grad_norm": 6.71875, "grad_norm_var": 26.47734375, "learning_rate": 0.0001, "loss": 6.8775, "loss/crossentropy": 2.320846140384674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42757023870944977, "step": 340 }, { "epoch": 0.00684, "grad_norm": 4.84375, "grad_norm_var": 26.739453125, "learning_rate": 0.0001, "loss": 6.7394, "loss/crossentropy": 2.419093132019043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4633703678846359, "step": 342 }, { "epoch": 0.00688, "grad_norm": 5.3125, "grad_norm_var": 26.632405598958332, "learning_rate": 0.0001, "loss": 6.7304, "loss/crossentropy": 1.939517080783844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4387335330247879, "step": 344 }, { "epoch": 0.00692, "grad_norm": 5.75, "grad_norm_var": 26.504410807291666, "learning_rate": 0.0001, "loss": 7.0914, "loss/crossentropy": 2.695888638496399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48031529784202576, "step": 346 }, { "epoch": 0.00696, "grad_norm": 6.625, "grad_norm_var": 26.468094889322916, "learning_rate": 0.0001, "loss": 6.8381, "loss/crossentropy": 2.245330333709717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5032568573951721, "step": 348 }, { "epoch": 0.007, "grad_norm": 4.28125, "grad_norm_var": 26.707421875, "learning_rate": 0.0001, "loss": 6.4774, "loss/crossentropy": 1.9668607115745544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4390462785959244, "step": 350 }, { "epoch": 0.00704, "grad_norm": 5.25, "grad_norm_var": 26.92008056640625, "learning_rate": 0.0001, "loss": 6.7795, "loss/crossentropy": 2.4035123586654663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43734824657440186, "step": 352 }, { "epoch": 0.00708, "grad_norm": 6.5, "grad_norm_var": 26.851460774739582, "learning_rate": 0.0001, "loss": 7.3932, "loss/crossentropy": 2.4636529684066772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5208825469017029, "step": 354 }, { "epoch": 0.00712, "grad_norm": 6.4375, "grad_norm_var": 0.5743123372395833, "learning_rate": 0.0001, "loss": 6.8482, "loss/crossentropy": 2.085066556930542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3971068561077118, "step": 356 }, { "epoch": 0.00716, "grad_norm": 4.59375, "grad_norm_var": 0.57847900390625, "learning_rate": 0.0001, "loss": 6.8962, "loss/crossentropy": 2.194266200065613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41747787594795227, "step": 358 }, { "epoch": 0.0072, "grad_norm": 4.59375, "grad_norm_var": 0.649462890625, "learning_rate": 0.0001, "loss": 6.7381, "loss/crossentropy": 2.4678618907928467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4633233994245529, "step": 360 }, { "epoch": 0.00724, "grad_norm": 5.59375, "grad_norm_var": 0.6476847330729166, "learning_rate": 0.0001, "loss": 6.5748, "loss/crossentropy": 2.362962484359741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4360974431037903, "step": 362 }, { "epoch": 0.00728, "grad_norm": 5.6875, "grad_norm_var": 0.5360514322916666, "learning_rate": 0.0001, "loss": 7.3497, "loss/crossentropy": 2.3162096738815308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4641396403312683, "step": 364 }, { "epoch": 0.00732, "grad_norm": 6.0625, "grad_norm_var": 0.4325358072916667, "learning_rate": 0.0001, "loss": 7.0856, "loss/crossentropy": 2.279396176338196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4331911951303482, "step": 366 }, { "epoch": 0.00736, "grad_norm": 4.96875, "grad_norm_var": 0.377587890625, "learning_rate": 0.0001, "loss": 6.8288, "loss/crossentropy": 2.333961606025696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43770918250083923, "step": 368 }, { "epoch": 0.0074, "grad_norm": 5.1875, "grad_norm_var": 0.27955322265625, "learning_rate": 0.0001, "loss": 6.9245, "loss/crossentropy": 2.130259871482849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4134685546159744, "step": 370 }, { "epoch": 0.00744, "grad_norm": 5.59375, "grad_norm_var": 0.19478759765625, "learning_rate": 0.0001, "loss": 6.4884, "loss/crossentropy": 2.3000282049179077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46207693219184875, "step": 372 }, { "epoch": 0.00748, "grad_norm": 5.59375, "grad_norm_var": 0.20399983723958334, "learning_rate": 0.0001, "loss": 7.3714, "loss/crossentropy": 2.687412142753601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46891947090625763, "step": 374 }, { "epoch": 0.00752, "grad_norm": 4.28125, "grad_norm_var": 0.21962483723958334, "learning_rate": 0.0001, "loss": 6.4194, "loss/crossentropy": 2.2366563081741333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42876073718070984, "step": 376 }, { "epoch": 0.00756, "grad_norm": 5.71875, "grad_norm_var": 0.23108317057291666, "learning_rate": 0.0001, "loss": 7.0141, "loss/crossentropy": 2.5960274934768677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4519210159778595, "step": 378 }, { "epoch": 0.0076, "grad_norm": 5.71875, "grad_norm_var": 0.23435872395833332, "learning_rate": 0.0001, "loss": 6.9654, "loss/crossentropy": 2.4690704345703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5009289383888245, "step": 380 }, { "epoch": 0.00764, "grad_norm": 5.25, "grad_norm_var": 0.17952067057291668, "learning_rate": 0.0001, "loss": 6.6068, "loss/crossentropy": 2.188890814781189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4912077784538269, "step": 382 }, { "epoch": 0.00768, "grad_norm": 5.15625, "grad_norm_var": 0.19055582682291666, "learning_rate": 0.0001, "loss": 6.5789, "loss/crossentropy": 2.2374125719070435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4437579810619354, "step": 384 }, { "epoch": 0.00772, "grad_norm": 5.1875, "grad_norm_var": 0.19308268229166667, "learning_rate": 0.0001, "loss": 6.8081, "loss/crossentropy": 2.101546287536621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39717453718185425, "step": 386 }, { "epoch": 0.00776, "grad_norm": 5.1875, "grad_norm_var": 0.182666015625, "learning_rate": 0.0001, "loss": 6.5688, "loss/crossentropy": 2.2907408475875854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4657403528690338, "step": 388 }, { "epoch": 0.0078, "grad_norm": 4.53125, "grad_norm_var": 0.18203125, "learning_rate": 0.0001, "loss": 6.4664, "loss/crossentropy": 1.9909976720809937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39810416102409363, "step": 390 }, { "epoch": 0.00784, "grad_norm": 6.28125, "grad_norm_var": 0.23746337890625, "learning_rate": 0.0001, "loss": 6.6015, "loss/crossentropy": 2.109456777572632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4226441979408264, "step": 392 }, { "epoch": 0.00788, "grad_norm": 6.03125, "grad_norm_var": 0.26549072265625, "learning_rate": 0.0001, "loss": 7.2592, "loss/crossentropy": 2.330615997314453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5082881152629852, "step": 394 }, { "epoch": 0.00792, "grad_norm": 5.90625, "grad_norm_var": 0.49332275390625, "learning_rate": 0.0001, "loss": 7.301, "loss/crossentropy": 2.37632155418396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5454596877098083, "step": 396 }, { "epoch": 0.00796, "grad_norm": 4.9375, "grad_norm_var": 0.5669921875, "learning_rate": 0.0001, "loss": 6.6057, "loss/crossentropy": 2.0303866863250732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3780263066291809, "step": 398 }, { "epoch": 0.008, "grad_norm": 4.1875, "grad_norm_var": 0.6671712239583333, "learning_rate": 0.0001, "loss": 6.8129, "loss/crossentropy": 2.077945590019226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4111042767763138, "step": 400 }, { "epoch": 0.00804, "grad_norm": 5.5625, "grad_norm_var": 0.6867146809895833, "learning_rate": 0.0001, "loss": 6.919, "loss/crossentropy": 2.3042391538619995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44224822521209717, "step": 402 }, { "epoch": 0.00808, "grad_norm": 4.3125, "grad_norm_var": 0.74537353515625, "learning_rate": 0.0001, "loss": 6.4785, "loss/crossentropy": 2.15978467464447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44845885038375854, "step": 404 }, { "epoch": 0.00812, "grad_norm": 4.90625, "grad_norm_var": 0.69576416015625, "learning_rate": 0.0001, "loss": 6.3875, "loss/crossentropy": 2.4571259021759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4629521369934082, "step": 406 }, { "epoch": 0.00816, "grad_norm": 5.40625, "grad_norm_var": 0.6654947916666667, "learning_rate": 0.0001, "loss": 7.1165, "loss/crossentropy": 2.653234601020813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5556869208812714, "step": 408 }, { "epoch": 0.0082, "grad_norm": 4.53125, "grad_norm_var": 0.66138916015625, "learning_rate": 0.0001, "loss": 6.6643, "loss/crossentropy": 1.9738067388534546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42570993304252625, "step": 410 }, { "epoch": 0.00824, "grad_norm": 4.0625, "grad_norm_var": 0.46920166015625, "learning_rate": 0.0001, "loss": 6.2205, "loss/crossentropy": 2.093988060951233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40544557571411133, "step": 412 }, { "epoch": 0.00828, "grad_norm": 4.34375, "grad_norm_var": 0.36213785807291665, "learning_rate": 0.0001, "loss": 6.6356, "loss/crossentropy": 2.4798851013183594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4204765260219574, "step": 414 }, { "epoch": 0.00832, "grad_norm": 5.625, "grad_norm_var": 0.3986287434895833, "learning_rate": 0.0001, "loss": 6.5601, "loss/crossentropy": 2.4342020750045776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5148662775754929, "step": 416 }, { "epoch": 0.00836, "grad_norm": 5.40625, "grad_norm_var": 0.3985514322916667, "learning_rate": 0.0001, "loss": 6.7757, "loss/crossentropy": 2.3637804985046387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45475171506404877, "step": 418 }, { "epoch": 0.0084, "grad_norm": 4.09375, "grad_norm_var": 0.39586181640625, "learning_rate": 0.0001, "loss": 6.6923, "loss/crossentropy": 2.4066261053085327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45181581377983093, "step": 420 }, { "epoch": 0.00844, "grad_norm": 4.0, "grad_norm_var": 0.43176676432291666, "learning_rate": 0.0001, "loss": 6.2428, "loss/crossentropy": 2.1273797750473022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3976929485797882, "step": 422 }, { "epoch": 0.00848, "grad_norm": 4.90625, "grad_norm_var": 0.26330973307291666, "learning_rate": 0.0001, "loss": 6.6524, "loss/crossentropy": 2.4227113723754883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46154454350471497, "step": 424 }, { "epoch": 0.00852, "grad_norm": 4.65625, "grad_norm_var": 0.261181640625, "learning_rate": 0.0001, "loss": 6.6558, "loss/crossentropy": 2.3502479791641235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43580910563468933, "step": 426 }, { "epoch": 0.00856, "grad_norm": 5.3125, "grad_norm_var": 0.26236572265625, "learning_rate": 0.0001, "loss": 6.903, "loss/crossentropy": 2.5034282207489014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4730219095945358, "step": 428 }, { "epoch": 0.0086, "grad_norm": 4.40625, "grad_norm_var": 0.2557576497395833, "learning_rate": 0.0001, "loss": 6.2148, "loss/crossentropy": 2.0902098417282104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.409458264708519, "step": 430 }, { "epoch": 0.00864, "grad_norm": 4.9375, "grad_norm_var": 0.19420166015625, "learning_rate": 0.0001, "loss": 6.4634, "loss/crossentropy": 2.2203429341316223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41066519916057587, "step": 432 }, { "epoch": 0.00868, "grad_norm": 5.03125, "grad_norm_var": 0.14855143229166667, "learning_rate": 0.0001, "loss": 6.6943, "loss/crossentropy": 2.568304419517517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48846572637557983, "step": 434 }, { "epoch": 0.00872, "grad_norm": 5.15625, "grad_norm_var": 0.13489176432291666, "learning_rate": 0.0001, "loss": 6.4829, "loss/crossentropy": 2.359646439552307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43610572814941406, "step": 436 }, { "epoch": 0.00876, "grad_norm": 5.03125, "grad_norm_var": 0.08870035807291667, "learning_rate": 0.0001, "loss": 6.6119, "loss/crossentropy": 2.2751121520996094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45837563276290894, "step": 438 }, { "epoch": 0.0088, "grad_norm": 4.125, "grad_norm_var": 0.13665364583333334, "learning_rate": 0.0001, "loss": 6.5338, "loss/crossentropy": 2.334506392478943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4196038395166397, "step": 440 }, { "epoch": 0.00884, "grad_norm": 4.1875, "grad_norm_var": 0.15891927083333332, "learning_rate": 0.0001, "loss": 6.2206, "loss/crossentropy": 1.9731069803237915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41661541163921356, "step": 442 }, { "epoch": 0.00888, "grad_norm": 5.03125, "grad_norm_var": 0.13339436848958333, "learning_rate": 0.0001, "loss": 6.3377, "loss/crossentropy": 2.319058418273926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4656156301498413, "step": 444 }, { "epoch": 0.00892, "grad_norm": 4.125, "grad_norm_var": 0.15013020833333332, "learning_rate": 0.0001, "loss": 6.5345, "loss/crossentropy": 2.309122085571289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41400712728500366, "step": 446 }, { "epoch": 0.00896, "grad_norm": 4.53125, "grad_norm_var": 0.14468994140625, "learning_rate": 0.0001, "loss": 6.5385, "loss/crossentropy": 1.867617905139923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39080700278282166, "step": 448 }, { "epoch": 0.009, "grad_norm": 5.21875, "grad_norm_var": 0.79830322265625, "learning_rate": 0.0001, "loss": 6.799, "loss/crossentropy": 2.205033838748932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4887084364891052, "step": 450 }, { "epoch": 0.00904, "grad_norm": 4.9375, "grad_norm_var": 0.826171875, "learning_rate": 0.0001, "loss": 6.6476, "loss/crossentropy": 2.3054174184799194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4140937328338623, "step": 452 }, { "epoch": 0.00908, "grad_norm": 4.53125, "grad_norm_var": 0.8572224934895833, "learning_rate": 0.0001, "loss": 6.7036, "loss/crossentropy": 2.1358219981193542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3849467635154724, "step": 454 }, { "epoch": 0.00912, "grad_norm": 4.9375, "grad_norm_var": 0.8132120768229166, "learning_rate": 0.0001, "loss": 6.4024, "loss/crossentropy": 1.9811997413635254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42263032495975494, "step": 456 }, { "epoch": 0.00916, "grad_norm": 4.8125, "grad_norm_var": 0.7787109375, "learning_rate": 0.0001, "loss": 6.8381, "loss/crossentropy": 2.319555103778839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4539293050765991, "step": 458 }, { "epoch": 0.0092, "grad_norm": 4.6875, "grad_norm_var": 0.7744425455729167, "learning_rate": 0.0001, "loss": 6.72, "loss/crossentropy": 2.4030569791793823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41373930871486664, "step": 460 }, { "epoch": 0.00924, "grad_norm": 4.8125, "grad_norm_var": 0.7218587239583333, "learning_rate": 0.0001, "loss": 6.7376, "loss/crossentropy": 2.4479328393936157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.447835311293602, "step": 462 }, { "epoch": 0.00928, "grad_norm": 5.5, "grad_norm_var": 0.7289021809895834, "learning_rate": 0.0001, "loss": 7.0562, "loss/crossentropy": 2.056324601173401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4374549984931946, "step": 464 }, { "epoch": 0.00932, "grad_norm": 4.28125, "grad_norm_var": 0.19256184895833334, "learning_rate": 0.0001, "loss": 6.6994, "loss/crossentropy": 2.4104079008102417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4148041307926178, "step": 466 }, { "epoch": 0.00936, "grad_norm": 4.75, "grad_norm_var": 0.13527018229166668, "learning_rate": 0.0001, "loss": 7.0758, "loss/crossentropy": 2.734652876853943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.511719822883606, "step": 468 }, { "epoch": 0.0094, "grad_norm": 5.0, "grad_norm_var": 0.13118082682291668, "learning_rate": 0.0001, "loss": 6.4772, "loss/crossentropy": 2.1748571395874023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3852091133594513, "step": 470 }, { "epoch": 0.00944, "grad_norm": 4.625, "grad_norm_var": 0.13912353515625, "learning_rate": 0.0001, "loss": 6.7127, "loss/crossentropy": 2.476449966430664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4194856435060501, "step": 472 }, { "epoch": 0.00948, "grad_norm": 4.40625, "grad_norm_var": 0.16887613932291667, "learning_rate": 0.0001, "loss": 6.494, "loss/crossentropy": 2.6383973360061646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44083887338638306, "step": 474 }, { "epoch": 0.00952, "grad_norm": 4.375, "grad_norm_var": 0.18435872395833333, "learning_rate": 0.0001, "loss": 6.3184, "loss/crossentropy": 2.3149259090423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3863615244626999, "step": 476 }, { "epoch": 0.00956, "grad_norm": 5.0625, "grad_norm_var": 0.18388264973958332, "learning_rate": 0.0001, "loss": 6.6217, "loss/crossentropy": 2.3096635341644287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40995509922504425, "step": 478 }, { "epoch": 0.0096, "grad_norm": 4.34375, "grad_norm_var": 0.10846354166666666, "learning_rate": 0.0001, "loss": 6.4849, "loss/crossentropy": 2.6495853662490845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.412080317735672, "step": 480 }, { "epoch": 0.00964, "grad_norm": 5.21875, "grad_norm_var": 0.13948160807291668, "learning_rate": 0.0001, "loss": 6.9293, "loss/crossentropy": 2.445754885673523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45622071623802185, "step": 482 }, { "epoch": 0.00968, "grad_norm": 4.40625, "grad_norm_var": 0.13661702473958334, "learning_rate": 0.0001, "loss": 6.4119, "loss/crossentropy": 2.418110966682434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40424875915050507, "step": 484 }, { "epoch": 0.00972, "grad_norm": 5.0625, "grad_norm_var": 0.15959879557291667, "learning_rate": 0.0001, "loss": 6.6356, "loss/crossentropy": 1.9564435482025146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.45834848284721375, "step": 486 }, { "epoch": 0.00976, "grad_norm": 5.375, "grad_norm_var": 0.16946207682291667, "learning_rate": 0.0001, "loss": 6.7056, "loss/crossentropy": 2.3772581815719604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4418800473213196, "step": 488 }, { "epoch": 0.0098, "grad_norm": 4.0625, "grad_norm_var": 0.208056640625, "learning_rate": 0.0001, "loss": 6.3239, "loss/crossentropy": 1.9526153802871704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3661540001630783, "step": 490 }, { "epoch": 0.00984, "grad_norm": 4.875, "grad_norm_var": 0.19999593098958332, "learning_rate": 0.0001, "loss": 6.7642, "loss/crossentropy": 2.40561842918396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44370119273662567, "step": 492 }, { "epoch": 0.00988, "grad_norm": 4.53125, "grad_norm_var": 0.1943359375, "learning_rate": 0.0001, "loss": 6.6475, "loss/crossentropy": 2.4316108226776123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4437306672334671, "step": 494 }, { "epoch": 0.00992, "grad_norm": 4.53125, "grad_norm_var": 0.25276285807291665, "learning_rate": 0.0001, "loss": 6.4095, "loss/crossentropy": 2.2919591665267944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4170967787504196, "step": 496 }, { "epoch": 0.00996, "grad_norm": 5.03125, "grad_norm_var": 0.21116129557291666, "learning_rate": 0.0001, "loss": 6.6291, "loss/crossentropy": 2.571357250213623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4073399156332016, "step": 498 }, { "epoch": 0.01, "grad_norm": 4.375, "grad_norm_var": 0.21330973307291667, "learning_rate": 0.0001, "loss": 6.2903, "loss/crossentropy": 2.389290928840637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41799379885196686, "step": 500 }, { "epoch": 0.01004, "grad_norm": 4.90625, "grad_norm_var": 0.20245768229166666, "learning_rate": 0.0001, "loss": 6.4319, "loss/crossentropy": 2.0904359221458435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3914492577314377, "step": 502 }, { "epoch": 0.01008, "grad_norm": 3.625, "grad_norm_var": 0.218212890625, "learning_rate": 0.0001, "loss": 6.5581, "loss/crossentropy": 2.5435129404067993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4660491645336151, "step": 504 }, { "epoch": 0.01012, "grad_norm": 4.75, "grad_norm_var": 0.21458333333333332, "learning_rate": 0.0001, "loss": 6.3665, "loss/crossentropy": 2.039083242416382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38188865780830383, "step": 506 }, { "epoch": 0.01016, "grad_norm": 5.21875, "grad_norm_var": 0.361181640625, "learning_rate": 0.0001, "loss": 6.4862, "loss/crossentropy": 2.056805729866028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4257620573043823, "step": 508 }, { "epoch": 0.0102, "grad_norm": 4.03125, "grad_norm_var": 0.38592122395833334, "learning_rate": 0.0001, "loss": 6.4822, "loss/crossentropy": 2.6178410053253174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43898941576480865, "step": 510 }, { "epoch": 0.01024, "grad_norm": 4.59375, "grad_norm_var": 0.3405558268229167, "learning_rate": 0.0001, "loss": 6.4638, "loss/crossentropy": 2.232435703277588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37769296765327454, "step": 512 }, { "epoch": 0.01028, "grad_norm": 4.40625, "grad_norm_var": 0.33033447265625, "learning_rate": 0.0001, "loss": 6.378, "loss/crossentropy": 1.8679735660552979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3805970698595047, "step": 514 }, { "epoch": 0.01032, "grad_norm": 4.3125, "grad_norm_var": 0.33175455729166664, "learning_rate": 0.0001, "loss": 6.7453, "loss/crossentropy": 2.6537472009658813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4450981914997101, "step": 516 }, { "epoch": 0.01036, "grad_norm": 4.03125, "grad_norm_var": 0.34915262858072915, "learning_rate": 0.0001, "loss": 6.2745, "loss/crossentropy": 2.5199841260910034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4748214781284332, "step": 518 }, { "epoch": 0.0104, "grad_norm": 5.5, "grad_norm_var": 0.3482004801432292, "learning_rate": 0.0001, "loss": 6.6212, "loss/crossentropy": 2.6603333950042725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4144483357667923, "step": 520 }, { "epoch": 0.01044, "grad_norm": 4.28125, "grad_norm_var": 0.3241119384765625, "learning_rate": 0.0001, "loss": 6.4073, "loss/crossentropy": 2.284039616584778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42801250517368317, "step": 522 }, { "epoch": 0.01048, "grad_norm": 3.78125, "grad_norm_var": 0.1894683837890625, "learning_rate": 0.0001, "loss": 6.2153, "loss/crossentropy": 2.473629951477051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4107852131128311, "step": 524 }, { "epoch": 0.01052, "grad_norm": 4.5, "grad_norm_var": 0.1827789306640625, "learning_rate": 0.0001, "loss": 6.6746, "loss/crossentropy": 2.1443774700164795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3606857359409332, "step": 526 }, { "epoch": 0.01056, "grad_norm": 3.96875, "grad_norm_var": 0.18889058430989583, "learning_rate": 0.0001, "loss": 5.8493, "loss/crossentropy": 1.8425135016441345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37770508229732513, "step": 528 }, { "epoch": 0.0106, "grad_norm": 3.859375, "grad_norm_var": 0.179052734375, "learning_rate": 0.0001, "loss": 6.3319, "loss/crossentropy": 2.3705164194107056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4354119151830673, "step": 530 }, { "epoch": 0.01064, "grad_norm": 4.53125, "grad_norm_var": 0.16344401041666667, "learning_rate": 0.0001, "loss": 6.4823, "loss/crossentropy": 2.1141316294670105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34213581681251526, "step": 532 }, { "epoch": 0.01068, "grad_norm": 4.3125, "grad_norm_var": 0.15579325358072918, "learning_rate": 0.0001, "loss": 6.0454, "loss/crossentropy": 2.14878511428833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3756616413593292, "step": 534 }, { "epoch": 0.01072, "grad_norm": 4.34375, "grad_norm_var": 0.052155558268229166, "learning_rate": 0.0001, "loss": 6.4363, "loss/crossentropy": 2.2513046264648438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.416190966963768, "step": 536 }, { "epoch": 0.01076, "grad_norm": 3.921875, "grad_norm_var": 0.049169921875, "learning_rate": 0.0001, "loss": 6.2674, "loss/crossentropy": 2.1337047815322876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3839789927005768, "step": 538 }, { "epoch": 0.0108, "grad_norm": 5.8125, "grad_norm_var": 0.20488993326822916, "learning_rate": 0.0001, "loss": 6.0559, "loss/crossentropy": 2.2019962072372437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3806414008140564, "step": 540 }, { "epoch": 0.01084, "grad_norm": 4.125, "grad_norm_var": 0.2133697509765625, "learning_rate": 0.0001, "loss": 5.8434, "loss/crossentropy": 2.113224983215332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797851800918579, "step": 542 }, { "epoch": 0.01088, "grad_norm": 4.0, "grad_norm_var": 0.21507059733072917, "learning_rate": 0.0001, "loss": 6.5131, "loss/crossentropy": 2.461037516593933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41689516603946686, "step": 544 }, { "epoch": 0.01092, "grad_norm": 4.625, "grad_norm_var": 0.24849853515625, "learning_rate": 0.0001, "loss": 6.4191, "loss/crossentropy": 2.277098298072815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4108494818210602, "step": 546 }, { "epoch": 0.01096, "grad_norm": 4.34375, "grad_norm_var": 0.48818359375, "learning_rate": 0.0001, "loss": 6.3436, "loss/crossentropy": 2.007936477661133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36796192824840546, "step": 548 }, { "epoch": 0.011, "grad_norm": 4.625, "grad_norm_var": 0.5631795247395833, "learning_rate": 0.0001, "loss": 6.5793, "loss/crossentropy": 2.197320520877838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38722094893455505, "step": 550 }, { "epoch": 0.01104, "grad_norm": 4.6875, "grad_norm_var": 0.5419230143229167, "learning_rate": 0.0001, "loss": 6.3185, "loss/crossentropy": 2.225432515144348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3928917348384857, "step": 552 }, { "epoch": 0.01108, "grad_norm": 4.28125, "grad_norm_var": 0.5455067952473959, "learning_rate": 0.0001, "loss": 5.9686, "loss/crossentropy": 2.5253326892852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41547106206417084, "step": 554 }, { "epoch": 0.01112, "grad_norm": 4.15625, "grad_norm_var": 0.44996337890625, "learning_rate": 0.0001, "loss": 6.2158, "loss/crossentropy": 2.488635540008545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4523312896490097, "step": 556 }, { "epoch": 0.01116, "grad_norm": 4.03125, "grad_norm_var": 0.46119384765625, "learning_rate": 0.0001, "loss": 6.3632, "loss/crossentropy": 2.395568609237671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40695953369140625, "step": 558 }, { "epoch": 0.0112, "grad_norm": 4.28125, "grad_norm_var": 0.4615234375, "learning_rate": 0.0001, "loss": 5.998, "loss/crossentropy": 2.383823275566101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4482497274875641, "step": 560 }, { "epoch": 0.01124, "grad_norm": 4.46875, "grad_norm_var": 0.46280008951822915, "learning_rate": 0.0001, "loss": 5.968, "loss/crossentropy": 2.0261693000793457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36933301389217377, "step": 562 }, { "epoch": 0.01128, "grad_norm": 4.65625, "grad_norm_var": 0.2221588134765625, "learning_rate": 0.0001, "loss": 6.1467, "loss/crossentropy": 2.131769895553589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.341948002576828, "step": 564 }, { "epoch": 0.01132, "grad_norm": 4.8125, "grad_norm_var": 0.0982574462890625, "learning_rate": 0.0001, "loss": 6.5288, "loss/crossentropy": 2.3899158239364624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.417646586894989, "step": 566 }, { "epoch": 0.01136, "grad_norm": 4.21875, "grad_norm_var": 0.07683817545572917, "learning_rate": 0.0001, "loss": 6.6198, "loss/crossentropy": 2.3139528036117554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39010919630527496, "step": 568 }, { "epoch": 0.0114, "grad_norm": 4.3125, "grad_norm_var": 0.06782124837239584, "learning_rate": 0.0001, "loss": 6.1209, "loss/crossentropy": 1.966201364994049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3554569333791733, "step": 570 }, { "epoch": 0.01144, "grad_norm": 4.21875, "grad_norm_var": 0.07773335774739583, "learning_rate": 0.0001, "loss": 6.1746, "loss/crossentropy": 2.2325466871261597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4008040726184845, "step": 572 }, { "epoch": 0.01148, "grad_norm": 4.96875, "grad_norm_var": 0.09848531087239583, "learning_rate": 0.0001, "loss": 6.121, "loss/crossentropy": 1.7670194506645203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3362845778465271, "step": 574 }, { "epoch": 0.01152, "grad_norm": 4.125, "grad_norm_var": 0.10274149576822916, "learning_rate": 0.0001, "loss": 6.3956, "loss/crossentropy": 2.332284092903137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4190225303173065, "step": 576 }, { "epoch": 0.01156, "grad_norm": 4.71875, "grad_norm_var": 0.10549723307291667, "learning_rate": 0.0001, "loss": 6.4367, "loss/crossentropy": 2.265386700630188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4109695851802826, "step": 578 }, { "epoch": 0.0116, "grad_norm": 5.15625, "grad_norm_var": 0.13811442057291667, "learning_rate": 0.0001, "loss": 6.4475, "loss/crossentropy": 2.265889286994934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42450472712516785, "step": 580 }, { "epoch": 0.01164, "grad_norm": 4.5625, "grad_norm_var": 0.163134765625, "learning_rate": 0.0001, "loss": 6.1483, "loss/crossentropy": 1.867847204208374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3627365529537201, "step": 582 }, { "epoch": 0.01168, "grad_norm": 3.796875, "grad_norm_var": 0.23681538899739582, "learning_rate": 0.0001, "loss": 6.5909, "loss/crossentropy": 2.5827555656433105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5194894820451736, "step": 584 }, { "epoch": 0.01172, "grad_norm": 4.5625, "grad_norm_var": 0.23188374837239584, "learning_rate": 0.0001, "loss": 6.6319, "loss/crossentropy": 2.26031893491745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.412450835108757, "step": 586 }, { "epoch": 0.01176, "grad_norm": 5.53125, "grad_norm_var": 0.2718739827473958, "learning_rate": 0.0001, "loss": 6.5782, "loss/crossentropy": 2.1885104179382324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41636165976524353, "step": 588 }, { "epoch": 0.0118, "grad_norm": 4.53125, "grad_norm_var": 0.26240132649739584, "learning_rate": 0.0001, "loss": 6.5247, "loss/crossentropy": 2.682767391204834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4279082715511322, "step": 590 }, { "epoch": 0.01184, "grad_norm": 4.3125, "grad_norm_var": 0.24807027180989583, "learning_rate": 0.0001, "loss": 6.5862, "loss/crossentropy": 2.185304641723633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4266812950372696, "step": 592 }, { "epoch": 0.01188, "grad_norm": 4.15625, "grad_norm_var": 0.2589752197265625, "learning_rate": 0.0001, "loss": 6.2494, "loss/crossentropy": 2.383716344833374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38570962846279144, "step": 594 }, { "epoch": 0.01192, "grad_norm": 4.3125, "grad_norm_var": 0.2337066650390625, "learning_rate": 0.0001, "loss": 6.3066, "loss/crossentropy": 2.2963398694992065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43451687693595886, "step": 596 }, { "epoch": 0.01196, "grad_norm": 4.96875, "grad_norm_var": 0.2128570556640625, "learning_rate": 0.0001, "loss": 6.3368, "loss/crossentropy": 2.2728757858276367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4280686676502228, "step": 598 }, { "epoch": 0.012, "grad_norm": 4.34375, "grad_norm_var": 0.141650390625, "learning_rate": 0.0001, "loss": 6.1147, "loss/crossentropy": 2.3486615419387817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37224848568439484, "step": 600 }, { "epoch": 0.01204, "grad_norm": 3.765625, "grad_norm_var": 0.17512613932291668, "learning_rate": 0.0001, "loss": 6.4017, "loss/crossentropy": 2.3265292644500732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4079990088939667, "step": 602 }, { "epoch": 0.01208, "grad_norm": 3.84375, "grad_norm_var": 0.08847554524739583, "learning_rate": 0.0001, "loss": 6.0861, "loss/crossentropy": 2.45253849029541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42630523443222046, "step": 604 }, { "epoch": 0.01212, "grad_norm": 4.03125, "grad_norm_var": 0.08964742024739583, "learning_rate": 0.0001, "loss": 6.4101, "loss/crossentropy": 2.4417446851730347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40342913568019867, "step": 606 }, { "epoch": 0.01216, "grad_norm": 4.25, "grad_norm_var": 0.08886617024739583, "learning_rate": 0.0001, "loss": 6.2513, "loss/crossentropy": 2.1483529210090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3846609443426132, "step": 608 }, { "epoch": 0.0122, "grad_norm": 4.1875, "grad_norm_var": 0.09990132649739583, "learning_rate": 0.0001, "loss": 6.5479, "loss/crossentropy": 2.481536865234375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.440000057220459, "step": 610 }, { "epoch": 0.01224, "grad_norm": 4.15625, "grad_norm_var": 0.09464518229166667, "learning_rate": 0.0001, "loss": 6.4986, "loss/crossentropy": 2.4472655057907104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41242682933807373, "step": 612 }, { "epoch": 0.01228, "grad_norm": 4.09375, "grad_norm_var": 0.07828369140625, "learning_rate": 0.0001, "loss": 6.4348, "loss/crossentropy": 2.3511135578155518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38262398540973663, "step": 614 }, { "epoch": 0.01232, "grad_norm": 3.71875, "grad_norm_var": 0.085791015625, "learning_rate": 0.0001, "loss": 6.4821, "loss/crossentropy": 2.5090683698654175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.44584622979164124, "step": 616 }, { "epoch": 0.01236, "grad_norm": 4.75, "grad_norm_var": 0.1117340087890625, "learning_rate": 0.0001, "loss": 6.0395, "loss/crossentropy": 2.166012942790985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40271493792533875, "step": 618 }, { "epoch": 0.0124, "grad_norm": 5.09375, "grad_norm_var": 0.15511067708333334, "learning_rate": 0.0001, "loss": 6.4644, "loss/crossentropy": 2.583309531211853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43380285799503326, "step": 620 }, { "epoch": 0.01244, "grad_norm": 3.984375, "grad_norm_var": 0.19877827962239583, "learning_rate": 0.0001, "loss": 6.3289, "loss/crossentropy": 2.125720262527466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40389589965343475, "step": 622 }, { "epoch": 0.01248, "grad_norm": 6.34375, "grad_norm_var": 0.4984283447265625, "learning_rate": 0.0001, "loss": 5.9651, "loss/crossentropy": 1.7034094333648682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31466029584407806, "step": 624 }, { "epoch": 0.01252, "grad_norm": 4.40625, "grad_norm_var": 0.4901041666666667, "learning_rate": 0.0001, "loss": 5.9451, "loss/crossentropy": 2.163281202316284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38622182607650757, "step": 626 }, { "epoch": 0.01256, "grad_norm": 3.875, "grad_norm_var": 0.4982818603515625, "learning_rate": 0.0001, "loss": 5.8931, "loss/crossentropy": 1.7754453420639038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33256995677948, "step": 628 }, { "epoch": 0.0126, "grad_norm": 4.09375, "grad_norm_var": 0.4894683837890625, "learning_rate": 0.0001, "loss": 6.8606, "loss/crossentropy": 2.273309350013733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5319447964429855, "step": 630 }, { "epoch": 0.01264, "grad_norm": 3.828125, "grad_norm_var": 0.480126953125, "learning_rate": 0.0001, "loss": 6.2103, "loss/crossentropy": 2.397401988506317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4255771040916443, "step": 632 }, { "epoch": 0.01268, "grad_norm": 5.28125, "grad_norm_var": 0.521875, "learning_rate": 0.0001, "loss": 6.4179, "loss/crossentropy": 2.3898611068725586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.46314406394958496, "step": 634 }, { "epoch": 0.01272, "grad_norm": 7.40625, "grad_norm_var": 1.0609212239583334, "learning_rate": 0.0001, "loss": 6.5634, "loss/crossentropy": 2.3740471601486206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.48123428225517273, "step": 636 }, { "epoch": 0.01276, "grad_norm": 3.90625, "grad_norm_var": 1.0519846598307292, "learning_rate": 0.0001, "loss": 6.1136, "loss/crossentropy": 2.236217498779297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39564159512519836, "step": 638 }, { "epoch": 0.0128, "grad_norm": 4.21875, "grad_norm_var": 0.7972157796223959, "learning_rate": 0.0001, "loss": 6.3134, "loss/crossentropy": 2.4049174785614014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41312308609485626, "step": 640 }, { "epoch": 0.01284, "grad_norm": 4.0, "grad_norm_var": 0.80142822265625, "learning_rate": 0.0001, "loss": 6.2726, "loss/crossentropy": 2.173800766468048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3983110189437866, "step": 642 }, { "epoch": 0.01288, "grad_norm": 4.65625, "grad_norm_var": 0.80084228515625, "learning_rate": 0.0001, "loss": 6.2779, "loss/crossentropy": 2.2124537229537964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38358184695243835, "step": 644 }, { "epoch": 0.01292, "grad_norm": 3.90625, "grad_norm_var": 0.8355377197265625, "learning_rate": 0.0001, "loss": 6.1209, "loss/crossentropy": 2.4939264059066772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3805152475833893, "step": 646 }, { "epoch": 0.01296, "grad_norm": 4.25, "grad_norm_var": 0.8250935872395834, "learning_rate": 0.0001, "loss": 6.0714, "loss/crossentropy": 2.4526472091674805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3958089202642441, "step": 648 }, { "epoch": 0.013, "grad_norm": 4.25, "grad_norm_var": 0.7562978108723958, "learning_rate": 0.0001, "loss": 6.3648, "loss/crossentropy": 2.5171029567718506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.446771502494812, "step": 650 }, { "epoch": 0.01304, "grad_norm": 4.125, "grad_norm_var": 0.1185943603515625, "learning_rate": 0.0001, "loss": 6.1656, "loss/crossentropy": 2.270598888397217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38787929713726044, "step": 652 }, { "epoch": 0.01308, "grad_norm": 4.15625, "grad_norm_var": 0.11404520670572917, "learning_rate": 0.0001, "loss": 5.7739, "loss/crossentropy": 1.8847617506980896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3258303850889206, "step": 654 }, { "epoch": 0.01312, "grad_norm": 3.9375, "grad_norm_var": 0.11286519368489584, "learning_rate": 0.0001, "loss": 6.042, "loss/crossentropy": 2.2471452951431274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38874460756778717, "step": 656 }, { "epoch": 0.01316, "grad_norm": 4.125, "grad_norm_var": 0.10369364420572917, "learning_rate": 0.0001, "loss": 6.4383, "loss/crossentropy": 2.3776252269744873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40553848445415497, "step": 658 }, { "epoch": 0.0132, "grad_norm": 3.84375, "grad_norm_var": 0.06461588541666667, "learning_rate": 0.0001, "loss": 5.5389, "loss/crossentropy": 2.291012167930603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3630271404981613, "step": 660 }, { "epoch": 0.01324, "grad_norm": 3.90625, "grad_norm_var": 0.06116536458333333, "learning_rate": 0.0001, "loss": 6.298, "loss/crossentropy": 2.2029112577438354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37700483202934265, "step": 662 }, { "epoch": 0.01328, "grad_norm": 3.984375, "grad_norm_var": 0.0598541259765625, "learning_rate": 0.0001, "loss": 6.4093, "loss/crossentropy": 2.571411967277527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.5071892440319061, "step": 664 }, { "epoch": 0.01332, "grad_norm": 3.484375, "grad_norm_var": 0.045947265625, "learning_rate": 0.0001, "loss": 5.6839, "loss/crossentropy": 2.148792862892151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3591457009315491, "step": 666 }, { "epoch": 0.01336, "grad_norm": 4.09375, "grad_norm_var": 0.04755452473958333, "learning_rate": 0.0001, "loss": 6.4444, "loss/crossentropy": 2.5091140270233154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3882133811712265, "step": 668 }, { "epoch": 0.0134, "grad_norm": 4.09375, "grad_norm_var": 0.049332682291666666, "learning_rate": 0.0001, "loss": 6.15, "loss/crossentropy": 2.4669524431228638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3941466957330704, "step": 670 }, { "epoch": 0.01344, "grad_norm": 3.9375, "grad_norm_var": 0.06620686848958333, "learning_rate": 0.0001, "loss": 6.5358, "loss/crossentropy": 2.4111422300338745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3973373472690582, "step": 672 }, { "epoch": 0.01348, "grad_norm": 4.1875, "grad_norm_var": 0.06495768229166667, "learning_rate": 0.0001, "loss": 5.765, "loss/crossentropy": 2.1109864115715027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36189010739326477, "step": 674 }, { "epoch": 0.01352, "grad_norm": 3.578125, "grad_norm_var": 0.0744140625, "learning_rate": 0.0001, "loss": 6.0289, "loss/crossentropy": 2.069494664669037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3562029302120209, "step": 676 }, { "epoch": 0.01356, "grad_norm": 4.0625, "grad_norm_var": 0.07224934895833333, "learning_rate": 0.0001, "loss": 6.4526, "loss/crossentropy": 2.1924527883529663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38948506116867065, "step": 678 }, { "epoch": 0.0136, "grad_norm": 3.6875, "grad_norm_var": 0.082421875, "learning_rate": 0.0001, "loss": 5.7311, "loss/crossentropy": 2.1603400707244873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40414859354496, "step": 680 }, { "epoch": 0.01364, "grad_norm": 3.71875, "grad_norm_var": 0.07177632649739583, "learning_rate": 0.0001, "loss": 6.2459, "loss/crossentropy": 2.515262722969055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4363710880279541, "step": 682 }, { "epoch": 0.01368, "grad_norm": 4.09375, "grad_norm_var": 0.07869364420572916, "learning_rate": 0.0001, "loss": 6.1174, "loss/crossentropy": 2.3615161180496216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35220713913440704, "step": 684 }, { "epoch": 0.01372, "grad_norm": 6.03125, "grad_norm_var": 0.3293690999348958, "learning_rate": 0.0001, "loss": 6.1941, "loss/crossentropy": 1.920493245124817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35293935239315033, "step": 686 }, { "epoch": 0.01376, "grad_norm": 4.03125, "grad_norm_var": 0.31579488118489585, "learning_rate": 0.0001, "loss": 6.4064, "loss/crossentropy": 2.493665337562561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4243907481431961, "step": 688 }, { "epoch": 0.0138, "grad_norm": 4.03125, "grad_norm_var": 0.3451324462890625, "learning_rate": 0.0001, "loss": 6.1519, "loss/crossentropy": 2.1182271242141724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39624081552028656, "step": 690 }, { "epoch": 0.01384, "grad_norm": 4.09375, "grad_norm_var": 0.32034098307291664, "learning_rate": 0.0001, "loss": 6.2457, "loss/crossentropy": 2.146227180957794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3491668850183487, "step": 692 }, { "epoch": 0.01388, "grad_norm": 3.734375, "grad_norm_var": 0.3376261393229167, "learning_rate": 0.0001, "loss": 5.7331, "loss/crossentropy": 1.8458876609802246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34608474373817444, "step": 694 }, { "epoch": 0.01392, "grad_norm": 4.03125, "grad_norm_var": 0.3294911702473958, "learning_rate": 0.0001, "loss": 6.3661, "loss/crossentropy": 2.270371675491333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33937984704971313, "step": 696 }, { "epoch": 0.01396, "grad_norm": 3.875, "grad_norm_var": 0.32066141764322914, "learning_rate": 0.0001, "loss": 5.8264, "loss/crossentropy": 2.040702223777771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3821101486682892, "step": 698 }, { "epoch": 0.014, "grad_norm": 4.09375, "grad_norm_var": 0.3197428385416667, "learning_rate": 0.0001, "loss": 6.1216, "loss/crossentropy": 2.1711822152137756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35383065044879913, "step": 700 }, { "epoch": 0.01404, "grad_norm": 3.890625, "grad_norm_var": 0.0597076416015625, "learning_rate": 0.0001, "loss": 5.8384, "loss/crossentropy": 2.3292651176452637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37580642104148865, "step": 702 }, { "epoch": 0.01408, "grad_norm": 4.34375, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 6.0789, "loss/crossentropy": 2.243735432624817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3997037708759308, "step": 704 }, { "epoch": 0.01412, "grad_norm": 3.921875, "grad_norm_var": 0.0524566650390625, "learning_rate": 0.0001, "loss": 5.9987, "loss/crossentropy": 1.9908145666122437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33391132950782776, "step": 706 }, { "epoch": 0.01416, "grad_norm": 4.375, "grad_norm_var": 0.06243082682291667, "learning_rate": 0.0001, "loss": 5.536, "loss/crossentropy": 2.280096471309662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3899015784263611, "step": 708 }, { "epoch": 0.0142, "grad_norm": 3.6875, "grad_norm_var": 0.12727762858072916, "learning_rate": 0.0001, "loss": 5.9373, "loss/crossentropy": 2.0714810490608215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797626197338104, "step": 710 }, { "epoch": 0.01424, "grad_norm": 3.875, "grad_norm_var": 0.14480692545572918, "learning_rate": 0.0001, "loss": 6.2466, "loss/crossentropy": 2.250674605369568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3948116898536682, "step": 712 }, { "epoch": 0.01428, "grad_norm": 3.65625, "grad_norm_var": 0.15719401041666667, "learning_rate": 0.0001, "loss": 5.8891, "loss/crossentropy": 2.0895228385925293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34471337497234344, "step": 714 }, { "epoch": 0.01432, "grad_norm": 4.71875, "grad_norm_var": 0.1826812744140625, "learning_rate": 0.0001, "loss": 6.3748, "loss/crossentropy": 2.337170124053955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4226381927728653, "step": 716 }, { "epoch": 0.01436, "grad_norm": 4.03125, "grad_norm_var": 0.174462890625, "learning_rate": 0.0001, "loss": 6.3996, "loss/crossentropy": 2.337436556816101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3738469183444977, "step": 718 }, { "epoch": 0.0144, "grad_norm": 4.15625, "grad_norm_var": 0.1669921875, "learning_rate": 0.0001, "loss": 6.0278, "loss/crossentropy": 1.9506489634513855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37431904673576355, "step": 720 }, { "epoch": 0.01444, "grad_norm": 3.71875, "grad_norm_var": 0.17283528645833332, "learning_rate": 0.0001, "loss": 5.8083, "loss/crossentropy": 2.253044009208679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36500048637390137, "step": 722 }, { "epoch": 0.01448, "grad_norm": 5.15625, "grad_norm_var": 0.23413798014322917, "learning_rate": 0.0001, "loss": 6.2307, "loss/crossentropy": 1.9980219006538391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34674490988254547, "step": 724 }, { "epoch": 0.01452, "grad_norm": 3.609375, "grad_norm_var": 0.1903472900390625, "learning_rate": 0.0001, "loss": 5.8283, "loss/crossentropy": 1.894662618637085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34921175241470337, "step": 726 }, { "epoch": 0.01456, "grad_norm": 3.609375, "grad_norm_var": 0.18310546875, "learning_rate": 0.0001, "loss": 6.1117, "loss/crossentropy": 2.304685115814209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36858032643795013, "step": 728 }, { "epoch": 0.0146, "grad_norm": 3.6875, "grad_norm_var": 0.17696940104166667, "learning_rate": 0.0001, "loss": 6.1041, "loss/crossentropy": 1.9020891189575195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3854057639837265, "step": 730 }, { "epoch": 0.01464, "grad_norm": 3.4375, "grad_norm_var": 0.15485026041666666, "learning_rate": 0.0001, "loss": 5.5327, "loss/crossentropy": 1.707019329071045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2961048036813736, "step": 732 }, { "epoch": 0.01468, "grad_norm": 3.828125, "grad_norm_var": 0.15344136555989582, "learning_rate": 0.0001, "loss": 6.0543, "loss/crossentropy": 2.423463463783264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3739102631807327, "step": 734 }, { "epoch": 0.01472, "grad_norm": 4.0625, "grad_norm_var": 0.15335286458333333, "learning_rate": 0.0001, "loss": 6.2855, "loss/crossentropy": 2.0490055680274963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4027387350797653, "step": 736 }, { "epoch": 0.01476, "grad_norm": 3.984375, "grad_norm_var": 0.15038960774739582, "learning_rate": 0.0001, "loss": 6.1236, "loss/crossentropy": 2.3712635040283203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3602859079837799, "step": 738 }, { "epoch": 0.0148, "grad_norm": 3.75, "grad_norm_var": 0.058649698893229164, "learning_rate": 0.0001, "loss": 6.2938, "loss/crossentropy": 2.306379556655884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38698625564575195, "step": 740 }, { "epoch": 0.01484, "grad_norm": 3.984375, "grad_norm_var": 0.055939737955729166, "learning_rate": 0.0001, "loss": 6.3983, "loss/crossentropy": 2.6846178770065308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3881431221961975, "step": 742 }, { "epoch": 0.01488, "grad_norm": 3.9375, "grad_norm_var": 0.058690388997395836, "learning_rate": 0.0001, "loss": 5.7984, "loss/crossentropy": 2.0555977821350098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36078818142414093, "step": 744 }, { "epoch": 0.01492, "grad_norm": 4.09375, "grad_norm_var": 0.059325154622395834, "learning_rate": 0.0001, "loss": 5.7834, "loss/crossentropy": 2.0597304701805115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34516778588294983, "step": 746 }, { "epoch": 0.01496, "grad_norm": 3.5625, "grad_norm_var": 0.049479166666666664, "learning_rate": 0.0001, "loss": 5.8911, "loss/crossentropy": 1.9607329964637756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33818933367729187, "step": 748 }, { "epoch": 0.015, "grad_norm": 3.546875, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 5.8484, "loss/crossentropy": 1.7854246497154236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3463115990161896, "step": 750 }, { "epoch": 0.01504, "grad_norm": 3.703125, "grad_norm_var": 0.0590240478515625, "learning_rate": 0.0001, "loss": 5.7788, "loss/crossentropy": 1.845078468322754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3714388310909271, "step": 752 }, { "epoch": 0.01508, "grad_norm": 3.9375, "grad_norm_var": 0.05771077473958333, "learning_rate": 0.0001, "loss": 6.1677, "loss/crossentropy": 2.3951027393341064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3723580837249756, "step": 754 }, { "epoch": 0.01512, "grad_norm": 3.6875, "grad_norm_var": 0.032486979166666666, "learning_rate": 0.0001, "loss": 6.219, "loss/crossentropy": 2.500870108604431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42450079321861267, "step": 756 }, { "epoch": 0.01516, "grad_norm": 3.75, "grad_norm_var": 0.030060831705729166, "learning_rate": 0.0001, "loss": 5.886, "loss/crossentropy": 2.1584274768829346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3798908591270447, "step": 758 }, { "epoch": 0.0152, "grad_norm": 3.5625, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 5.7937, "loss/crossentropy": 2.3126983642578125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33131279051303864, "step": 760 }, { "epoch": 0.01524, "grad_norm": 3.5, "grad_norm_var": 0.020246378580729165, "learning_rate": 0.0001, "loss": 5.9182, "loss/crossentropy": 2.349764347076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3910932093858719, "step": 762 }, { "epoch": 0.01528, "grad_norm": 3.640625, "grad_norm_var": 0.0224273681640625, "learning_rate": 0.0001, "loss": 6.0472, "loss/crossentropy": 2.2232795357704163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3483322113752365, "step": 764 }, { "epoch": 0.01532, "grad_norm": 3.65625, "grad_norm_var": 0.0207427978515625, "learning_rate": 0.0001, "loss": 6.0312, "loss/crossentropy": 2.2273412942886353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38545119762420654, "step": 766 }, { "epoch": 0.01536, "grad_norm": 4.125, "grad_norm_var": 0.03355712890625, "learning_rate": 0.0001, "loss": 6.0523, "loss/crossentropy": 2.5879149436950684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38191574811935425, "step": 768 }, { "epoch": 0.0154, "grad_norm": 3.953125, "grad_norm_var": 0.034032185872395836, "learning_rate": 0.0001, "loss": 6.027, "loss/crossentropy": 2.3305420875549316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3704134076833725, "step": 770 }, { "epoch": 0.01544, "grad_norm": 3.59375, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 6.121, "loss/crossentropy": 2.0433666706085205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34387587010860443, "step": 772 }, { "epoch": 0.01548, "grad_norm": 3.609375, "grad_norm_var": 0.03337300618489583, "learning_rate": 0.0001, "loss": 5.5837, "loss/crossentropy": 2.1127337217330933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3484109789133072, "step": 774 }, { "epoch": 0.01552, "grad_norm": 3.859375, "grad_norm_var": 0.029002888997395834, "learning_rate": 0.0001, "loss": 6.0028, "loss/crossentropy": 2.1637459993362427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3757011145353317, "step": 776 }, { "epoch": 0.01556, "grad_norm": 3.734375, "grad_norm_var": 0.024446614583333335, "learning_rate": 0.0001, "loss": 5.9904, "loss/crossentropy": 2.4118471145629883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797076344490051, "step": 778 }, { "epoch": 0.0156, "grad_norm": 3.828125, "grad_norm_var": 0.022191365559895832, "learning_rate": 0.0001, "loss": 6.2649, "loss/crossentropy": 1.9410768151283264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3254295587539673, "step": 780 }, { "epoch": 0.01564, "grad_norm": 3.609375, "grad_norm_var": 0.023558553059895834, "learning_rate": 0.0001, "loss": 5.9008, "loss/crossentropy": 2.1669737100601196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3583529591560364, "step": 782 }, { "epoch": 0.01568, "grad_norm": 3.5625, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 5.9868, "loss/crossentropy": 2.217113733291626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3503105044364929, "step": 784 }, { "epoch": 0.01572, "grad_norm": 3.84375, "grad_norm_var": 0.017154947916666666, "learning_rate": 0.0001, "loss": 6.0695, "loss/crossentropy": 2.588438868522644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3941201716661453, "step": 786 }, { "epoch": 0.01576, "grad_norm": 3.546875, "grad_norm_var": 0.018310546875, "learning_rate": 0.0001, "loss": 5.9436, "loss/crossentropy": 2.3925808668136597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3755808621644974, "step": 788 }, { "epoch": 0.0158, "grad_norm": 3.625, "grad_norm_var": 0.018684895833333333, "learning_rate": 0.0001, "loss": 5.7254, "loss/crossentropy": 1.9568504691123962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3865346759557724, "step": 790 }, { "epoch": 0.01584, "grad_norm": 3.96875, "grad_norm_var": 0.024104817708333334, "learning_rate": 0.0001, "loss": 6.0174, "loss/crossentropy": 2.336462616920471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36937348544597626, "step": 792 }, { "epoch": 0.01588, "grad_norm": 3.421875, "grad_norm_var": 0.03623046875, "learning_rate": 0.0001, "loss": 5.7742, "loss/crossentropy": 2.1867082715034485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35803911089897156, "step": 794 }, { "epoch": 0.01592, "grad_norm": 3.765625, "grad_norm_var": 0.03877665201822917, "learning_rate": 0.0001, "loss": 6.2825, "loss/crossentropy": 2.070562243461609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42781224846839905, "step": 796 }, { "epoch": 0.01596, "grad_norm": 4.09375, "grad_norm_var": 0.044384765625, "learning_rate": 0.0001, "loss": 6.401, "loss/crossentropy": 2.160820960998535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34495553374290466, "step": 798 }, { "epoch": 0.016, "grad_norm": 4.0, "grad_norm_var": 0.045426432291666666, "learning_rate": 0.0001, "loss": 5.9628, "loss/crossentropy": 2.3424230813980103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4417698383331299, "step": 800 }, { "epoch": 0.01604, "grad_norm": 4.59375, "grad_norm_var": 0.08385009765625, "learning_rate": 0.0001, "loss": 6.1984, "loss/crossentropy": 2.090175747871399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35883304476737976, "step": 802 }, { "epoch": 0.01608, "grad_norm": 3.90625, "grad_norm_var": 0.0759429931640625, "learning_rate": 0.0001, "loss": 6.2044, "loss/crossentropy": 2.460660457611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36723683774471283, "step": 804 }, { "epoch": 0.01612, "grad_norm": 3.78125, "grad_norm_var": 0.0783203125, "learning_rate": 0.0001, "loss": 5.8788, "loss/crossentropy": 2.2680885791778564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3925183415412903, "step": 806 }, { "epoch": 0.01616, "grad_norm": 3.796875, "grad_norm_var": 0.10422261555989583, "learning_rate": 0.0001, "loss": 6.1179, "loss/crossentropy": 2.272566020488739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3425859659910202, "step": 808 }, { "epoch": 0.0162, "grad_norm": 3.546875, "grad_norm_var": 0.10061442057291667, "learning_rate": 0.0001, "loss": 5.8933, "loss/crossentropy": 2.2417107820510864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3634066879749298, "step": 810 }, { "epoch": 0.01624, "grad_norm": 4.09375, "grad_norm_var": 0.10075581868489583, "learning_rate": 0.0001, "loss": 5.9907, "loss/crossentropy": 2.2117987275123596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3669978231191635, "step": 812 }, { "epoch": 0.01628, "grad_norm": 4.53125, "grad_norm_var": 0.12078348795572917, "learning_rate": 0.0001, "loss": 6.1767, "loss/crossentropy": 2.3471380472183228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39207911491394043, "step": 814 }, { "epoch": 0.01632, "grad_norm": 4.1875, "grad_norm_var": 0.12200113932291666, "learning_rate": 0.0001, "loss": 6.005, "loss/crossentropy": 2.1516740322113037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3394138962030411, "step": 816 }, { "epoch": 0.01636, "grad_norm": 3.765625, "grad_norm_var": 0.10528055826822917, "learning_rate": 0.0001, "loss": 6.0827, "loss/crossentropy": 2.5085272789001465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.39268872141838074, "step": 818 }, { "epoch": 0.0164, "grad_norm": 3.515625, "grad_norm_var": 0.1198883056640625, "learning_rate": 0.0001, "loss": 6.0363, "loss/crossentropy": 2.3051916360855103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3798936903476715, "step": 820 }, { "epoch": 0.01644, "grad_norm": 3.5, "grad_norm_var": 0.14094136555989584, "learning_rate": 0.0001, "loss": 5.4403, "loss/crossentropy": 2.1685640811920166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3666132390499115, "step": 822 }, { "epoch": 0.01648, "grad_norm": 3.53125, "grad_norm_var": 0.109521484375, "learning_rate": 0.0001, "loss": 5.6981, "loss/crossentropy": 2.3374987840652466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32505205273628235, "step": 824 }, { "epoch": 0.01652, "grad_norm": 5.4375, "grad_norm_var": 0.280859375, "learning_rate": 0.0001, "loss": 6.1428, "loss/crossentropy": 2.6427528858184814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4296632409095764, "step": 826 }, { "epoch": 0.01656, "grad_norm": 4.0, "grad_norm_var": 0.28609619140625, "learning_rate": 0.0001, "loss": 5.8304, "loss/crossentropy": 2.1534847617149353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37605202198028564, "step": 828 }, { "epoch": 0.0166, "grad_norm": 4.28125, "grad_norm_var": 0.2637278238932292, "learning_rate": 0.0001, "loss": 6.2115, "loss/crossentropy": 1.9642478227615356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43740569055080414, "step": 830 }, { "epoch": 0.01664, "grad_norm": 4.59375, "grad_norm_var": 3.3863433837890624, "learning_rate": 0.0001, "loss": 6.5306, "loss/crossentropy": 2.283148407936096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3901669532060623, "step": 832 }, { "epoch": 0.01668, "grad_norm": 3.46875, "grad_norm_var": 3.382255045572917, "learning_rate": 0.0001, "loss": 6.0831, "loss/crossentropy": 2.418351888656616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4011431038379669, "step": 834 }, { "epoch": 0.01672, "grad_norm": 4.03125, "grad_norm_var": 3.322565714518229, "learning_rate": 0.0001, "loss": 6.1852, "loss/crossentropy": 2.40928852558136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40596309304237366, "step": 836 }, { "epoch": 0.01676, "grad_norm": 3.34375, "grad_norm_var": 3.3059234619140625, "learning_rate": 0.0001, "loss": 5.791, "loss/crossentropy": 2.4211392402648926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3640473484992981, "step": 838 }, { "epoch": 0.0168, "grad_norm": 3.484375, "grad_norm_var": 3.2890625, "learning_rate": 0.0001, "loss": 5.6553, "loss/crossentropy": 2.047215461730957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3346950262784958, "step": 840 }, { "epoch": 0.01684, "grad_norm": 3.578125, "grad_norm_var": 3.28623046875, "learning_rate": 0.0001, "loss": 5.7093, "loss/crossentropy": 2.020021378993988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3442958742380142, "step": 842 }, { "epoch": 0.01688, "grad_norm": 3.96875, "grad_norm_var": 3.27164306640625, "learning_rate": 0.0001, "loss": 6.1484, "loss/crossentropy": 2.1585127115249634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3768642693758011, "step": 844 }, { "epoch": 0.01692, "grad_norm": 3.84375, "grad_norm_var": 3.298802693684896, "learning_rate": 0.0001, "loss": 5.8512, "loss/crossentropy": 2.2717286348342896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3644135594367981, "step": 846 }, { "epoch": 0.01696, "grad_norm": 3.828125, "grad_norm_var": 0.06539713541666667, "learning_rate": 0.0001, "loss": 5.9203, "loss/crossentropy": 2.2683321237564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3795373737812042, "step": 848 }, { "epoch": 0.017, "grad_norm": 3.1875, "grad_norm_var": 0.07066141764322917, "learning_rate": 0.0001, "loss": 5.7235, "loss/crossentropy": 2.1189464330673218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35512739419937134, "step": 850 }, { "epoch": 0.01704, "grad_norm": 3.65625, "grad_norm_var": 0.060384114583333336, "learning_rate": 0.0001, "loss": 5.6536, "loss/crossentropy": 2.260777235031128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35152101516723633, "step": 852 }, { "epoch": 0.01708, "grad_norm": 3.671875, "grad_norm_var": 0.06897379557291666, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.323577642440796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32252687215805054, "step": 854 }, { "epoch": 0.01712, "grad_norm": 3.71875, "grad_norm_var": 0.07024637858072917, "learning_rate": 0.0001, "loss": 6.1006, "loss/crossentropy": 2.5965300798416138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.43394728004932404, "step": 856 }, { "epoch": 0.01716, "grad_norm": 3.140625, "grad_norm_var": 0.08170572916666667, "learning_rate": 0.0001, "loss": 5.8612, "loss/crossentropy": 2.078580856323242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3437638282775879, "step": 858 }, { "epoch": 0.0172, "grad_norm": 3.359375, "grad_norm_var": 0.08163960774739583, "learning_rate": 0.0001, "loss": 5.8394, "loss/crossentropy": 2.425456404685974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3522993326187134, "step": 860 }, { "epoch": 0.01724, "grad_norm": 3.515625, "grad_norm_var": 0.08478190104166666, "learning_rate": 0.0001, "loss": 6.0154, "loss/crossentropy": 2.2830835580825806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37188920378685, "step": 862 }, { "epoch": 0.01728, "grad_norm": 3.8125, "grad_norm_var": 0.06896870930989583, "learning_rate": 0.0001, "loss": 5.9086, "loss/crossentropy": 2.090674340724945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32989686727523804, "step": 864 }, { "epoch": 0.01732, "grad_norm": 3.8125, "grad_norm_var": 0.0698394775390625, "learning_rate": 0.0001, "loss": 5.9617, "loss/crossentropy": 2.304458498954773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37342821061611176, "step": 866 }, { "epoch": 0.01736, "grad_norm": 3.421875, "grad_norm_var": 0.0715484619140625, "learning_rate": 0.0001, "loss": 5.8593, "loss/crossentropy": 2.6545844078063965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3627774566411972, "step": 868 }, { "epoch": 0.0174, "grad_norm": 3.3125, "grad_norm_var": 0.059789021809895836, "learning_rate": 0.0001, "loss": 5.6956, "loss/crossentropy": 1.9977945685386658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.318715900182724, "step": 870 }, { "epoch": 0.01744, "grad_norm": 3.203125, "grad_norm_var": 0.08033854166666667, "learning_rate": 0.0001, "loss": 5.7408, "loss/crossentropy": 1.9226595759391785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3187277615070343, "step": 872 }, { "epoch": 0.01748, "grad_norm": 3.6875, "grad_norm_var": 0.0694732666015625, "learning_rate": 0.0001, "loss": 5.9863, "loss/crossentropy": 2.323302686214447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36724327504634857, "step": 874 }, { "epoch": 0.01752, "grad_norm": 3.5625, "grad_norm_var": 0.07043355305989583, "learning_rate": 0.0001, "loss": 5.9913, "loss/crossentropy": 2.254343032836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3444042354822159, "step": 876 }, { "epoch": 0.01756, "grad_norm": 3.296875, "grad_norm_var": 0.0774078369140625, "learning_rate": 0.0001, "loss": 5.5038, "loss/crossentropy": 2.0819836854934692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3477388769388199, "step": 878 }, { "epoch": 0.0176, "grad_norm": 3.703125, "grad_norm_var": 0.10198160807291666, "learning_rate": 0.0001, "loss": 5.9947, "loss/crossentropy": 2.377693295478821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36597058176994324, "step": 880 }, { "epoch": 0.01764, "grad_norm": 4.21875, "grad_norm_var": 0.11789449055989583, "learning_rate": 0.0001, "loss": 6.3011, "loss/crossentropy": 2.5598798990249634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4082639515399933, "step": 882 }, { "epoch": 0.01768, "grad_norm": 3.6875, "grad_norm_var": 0.1294830322265625, "learning_rate": 0.0001, "loss": 5.9966, "loss/crossentropy": 2.2847843170166016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3531750440597534, "step": 884 }, { "epoch": 0.01772, "grad_norm": 3.390625, "grad_norm_var": 0.12878316243489582, "learning_rate": 0.0001, "loss": 5.6685, "loss/crossentropy": 1.8283140063285828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.336555078625679, "step": 886 }, { "epoch": 0.01776, "grad_norm": 3.65625, "grad_norm_var": 0.09971415201822917, "learning_rate": 0.0001, "loss": 5.9507, "loss/crossentropy": 2.1001436710357666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3380406051874161, "step": 888 }, { "epoch": 0.0178, "grad_norm": 4.21875, "grad_norm_var": 0.11876627604166666, "learning_rate": 0.0001, "loss": 5.7552, "loss/crossentropy": 2.0079030990600586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35362809896469116, "step": 890 }, { "epoch": 0.01784, "grad_norm": 4.46875, "grad_norm_var": 0.15650634765625, "learning_rate": 0.0001, "loss": 5.7416, "loss/crossentropy": 2.176286220550537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34369874000549316, "step": 892 }, { "epoch": 0.01788, "grad_norm": 3.984375, "grad_norm_var": 0.13087565104166668, "learning_rate": 0.0001, "loss": 5.9236, "loss/crossentropy": 2.17675244808197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3495863378047943, "step": 894 }, { "epoch": 0.01792, "grad_norm": 4.0, "grad_norm_var": 0.12189127604166666, "learning_rate": 0.0001, "loss": 5.9109, "loss/crossentropy": 2.312318801879883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40353919565677643, "step": 896 }, { "epoch": 0.01796, "grad_norm": 3.375, "grad_norm_var": 0.14010009765625, "learning_rate": 0.0001, "loss": 6.071, "loss/crossentropy": 2.28923499584198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3770768642425537, "step": 898 }, { "epoch": 0.018, "grad_norm": 3.546875, "grad_norm_var": 0.1447662353515625, "learning_rate": 0.0001, "loss": 6.0275, "loss/crossentropy": 2.2060720920562744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35431359708309174, "step": 900 }, { "epoch": 0.01804, "grad_norm": 3.296875, "grad_norm_var": 0.17552083333333332, "learning_rate": 0.0001, "loss": 5.4052, "loss/crossentropy": 2.0325206518173218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3037416934967041, "step": 902 }, { "epoch": 0.01808, "grad_norm": 3.546875, "grad_norm_var": 0.17635091145833334, "learning_rate": 0.0001, "loss": 5.9148, "loss/crossentropy": 2.1943042278289795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3509814143180847, "step": 904 }, { "epoch": 0.01812, "grad_norm": 3.4375, "grad_norm_var": 0.1696197509765625, "learning_rate": 0.0001, "loss": 5.563, "loss/crossentropy": 1.9589214324951172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31621459126472473, "step": 906 }, { "epoch": 0.01816, "grad_norm": 3.375, "grad_norm_var": 0.12841389973958334, "learning_rate": 0.0001, "loss": 5.6511, "loss/crossentropy": 2.329489588737488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3837278485298157, "step": 908 }, { "epoch": 0.0182, "grad_norm": 3.5625, "grad_norm_var": 0.09648030598958333, "learning_rate": 0.0001, "loss": 5.8082, "loss/crossentropy": 2.1757726669311523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3659580200910568, "step": 910 }, { "epoch": 0.01824, "grad_norm": 3.75, "grad_norm_var": 0.08772786458333333, "learning_rate": 0.0001, "loss": 5.7372, "loss/crossentropy": 2.1498661041259766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3474857211112976, "step": 912 }, { "epoch": 0.01828, "grad_norm": 15.8125, "grad_norm_var": 9.504325358072917, "learning_rate": 0.0001, "loss": 5.9297, "loss/crossentropy": 2.4722740650177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.505307987332344, "step": 914 }, { "epoch": 0.01832, "grad_norm": 9.0, "grad_norm_var": 10.75227762858073, "learning_rate": 0.0001, "loss": 5.6296, "loss/crossentropy": 1.855428695678711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31347331404685974, "step": 916 }, { "epoch": 0.01836, "grad_norm": 3.75, "grad_norm_var": 10.50523173014323, "learning_rate": 0.0001, "loss": 5.9769, "loss/crossentropy": 2.326256275177002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3996751010417938, "step": 918 }, { "epoch": 0.0184, "grad_norm": 3.546875, "grad_norm_var": 10.518257649739583, "learning_rate": 0.0001, "loss": 5.8444, "loss/crossentropy": 2.3712844848632812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37993232905864716, "step": 920 }, { "epoch": 0.01844, "grad_norm": 3.5, "grad_norm_var": 10.657861328125, "learning_rate": 0.0001, "loss": 5.5577, "loss/crossentropy": 2.0161430835723877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3590858578681946, "step": 922 }, { "epoch": 0.01848, "grad_norm": 5.40625, "grad_norm_var": 10.520926920572917, "learning_rate": 0.0001, "loss": 5.6675, "loss/crossentropy": 2.2401121258735657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33380126953125, "step": 924 }, { "epoch": 0.01852, "grad_norm": 3.328125, "grad_norm_var": 10.598356119791667, "learning_rate": 0.0001, "loss": 5.9331, "loss/crossentropy": 2.2354423999786377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34832654893398285, "step": 926 }, { "epoch": 0.01856, "grad_norm": 3.46875, "grad_norm_var": 10.647850545247396, "learning_rate": 0.0001, "loss": 5.5212, "loss/crossentropy": 2.158566176891327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3664778769016266, "step": 928 }, { "epoch": 0.0186, "grad_norm": 3.703125, "grad_norm_var": 2.1230377197265624, "learning_rate": 0.0001, "loss": 5.9146, "loss/crossentropy": 2.270231008529663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35586032271385193, "step": 930 }, { "epoch": 0.01864, "grad_norm": 3.609375, "grad_norm_var": 0.3744303385416667, "learning_rate": 0.0001, "loss": 6.0013, "loss/crossentropy": 2.233540892601013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3789799362421036, "step": 932 }, { "epoch": 0.01868, "grad_norm": 3.65625, "grad_norm_var": 0.27898763020833334, "learning_rate": 0.0001, "loss": 5.5019, "loss/crossentropy": 1.9381731152534485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28776855766773224, "step": 934 }, { "epoch": 0.01872, "grad_norm": 3.34375, "grad_norm_var": 0.28227437337239586, "learning_rate": 0.0001, "loss": 5.6759, "loss/crossentropy": 2.4225244522094727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.355392187833786, "step": 936 }, { "epoch": 0.01876, "grad_norm": 3.15625, "grad_norm_var": 0.28084208170572916, "learning_rate": 0.0001, "loss": 5.911, "loss/crossentropy": 2.58090603351593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3640855699777603, "step": 938 }, { "epoch": 0.0188, "grad_norm": 3.5625, "grad_norm_var": 0.03673502604166667, "learning_rate": 0.0001, "loss": 6.0623, "loss/crossentropy": 2.436452269554138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797999918460846, "step": 940 }, { "epoch": 0.01884, "grad_norm": 3.3125, "grad_norm_var": 0.03680013020833333, "learning_rate": 0.0001, "loss": 5.7428, "loss/crossentropy": 2.0378769636154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3278265744447708, "step": 942 }, { "epoch": 0.01888, "grad_norm": 3.546875, "grad_norm_var": 0.03860270182291667, "learning_rate": 0.0001, "loss": 5.6211, "loss/crossentropy": 2.1212962865829468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34586282074451447, "step": 944 }, { "epoch": 0.01892, "grad_norm": 3.515625, "grad_norm_var": 0.040087890625, "learning_rate": 0.0001, "loss": 5.6695, "loss/crossentropy": 2.1884353160858154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40608666837215424, "step": 946 }, { "epoch": 0.01896, "grad_norm": 3.671875, "grad_norm_var": 0.046858723958333334, "learning_rate": 0.0001, "loss": 5.6684, "loss/crossentropy": 2.2093260288238525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3519093841314316, "step": 948 }, { "epoch": 0.019, "grad_norm": 3.328125, "grad_norm_var": 0.03243815104166667, "learning_rate": 0.0001, "loss": 6.0842, "loss/crossentropy": 2.4246588945388794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37247334420681, "step": 950 }, { "epoch": 0.01904, "grad_norm": 3.671875, "grad_norm_var": 0.031022135416666666, "learning_rate": 0.0001, "loss": 5.6116, "loss/crossentropy": 1.932490050792694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32985979318618774, "step": 952 }, { "epoch": 0.01908, "grad_norm": 3.90625, "grad_norm_var": 0.7084920247395833, "learning_rate": 0.0001, "loss": 5.7393, "loss/crossentropy": 2.4439035654067993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38164034485816956, "step": 954 }, { "epoch": 0.01912, "grad_norm": 3.296875, "grad_norm_var": 0.72437744140625, "learning_rate": 0.0001, "loss": 5.6255, "loss/crossentropy": 1.8876591920852661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3267661929130554, "step": 956 }, { "epoch": 0.01916, "grad_norm": 3.578125, "grad_norm_var": 0.6990193684895833, "learning_rate": 0.0001, "loss": 5.7367, "loss/crossentropy": 2.284990072250366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34810060262680054, "step": 958 }, { "epoch": 0.0192, "grad_norm": 3.53125, "grad_norm_var": 0.6961008707682291, "learning_rate": 0.0001, "loss": 5.888, "loss/crossentropy": 2.333263397216797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.42767176032066345, "step": 960 }, { "epoch": 0.01924, "grad_norm": 3.296875, "grad_norm_var": 0.7323527018229167, "learning_rate": 0.0001, "loss": 5.6021, "loss/crossentropy": 2.2526148557662964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3284706473350525, "step": 962 }, { "epoch": 0.01928, "grad_norm": 4.25, "grad_norm_var": 0.7586252848307292, "learning_rate": 0.0001, "loss": 5.5479, "loss/crossentropy": 2.1782984137535095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35957905650138855, "step": 964 }, { "epoch": 0.01932, "grad_norm": 4.28125, "grad_norm_var": 0.75205078125, "learning_rate": 0.0001, "loss": 6.2783, "loss/crossentropy": 2.292098045349121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35862940549850464, "step": 966 }, { "epoch": 0.01936, "grad_norm": 3.546875, "grad_norm_var": 0.7503214518229167, "learning_rate": 0.0001, "loss": 5.9252, "loss/crossentropy": 2.102781653404236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3527261018753052, "step": 968 }, { "epoch": 0.0194, "grad_norm": 3.453125, "grad_norm_var": 0.11111653645833333, "learning_rate": 0.0001, "loss": 5.8891, "loss/crossentropy": 2.223380208015442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3608042299747467, "step": 970 }, { "epoch": 0.01944, "grad_norm": 3.4375, "grad_norm_var": 0.10501200358072917, "learning_rate": 0.0001, "loss": 5.3348, "loss/crossentropy": 2.0684096813201904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31996411085128784, "step": 972 }, { "epoch": 0.01948, "grad_norm": 4.15625, "grad_norm_var": 0.126708984375, "learning_rate": 0.0001, "loss": 5.6642, "loss/crossentropy": 2.2011090517044067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34404022991657257, "step": 974 }, { "epoch": 0.01952, "grad_norm": 3.140625, "grad_norm_var": 0.14937235514322916, "learning_rate": 0.0001, "loss": 5.5033, "loss/crossentropy": 2.027641534805298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31718479096889496, "step": 976 }, { "epoch": 0.01956, "grad_norm": 3.484375, "grad_norm_var": 0.13909403483072916, "learning_rate": 0.0001, "loss": 5.7294, "loss/crossentropy": 2.311842203140259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3904002010822296, "step": 978 }, { "epoch": 0.0196, "grad_norm": 3.859375, "grad_norm_var": 0.11236572265625, "learning_rate": 0.0001, "loss": 5.4402, "loss/crossentropy": 2.3605271577835083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38789358735084534, "step": 980 }, { "epoch": 0.01964, "grad_norm": 3.765625, "grad_norm_var": 0.095166015625, "learning_rate": 0.0001, "loss": 6.1094, "loss/crossentropy": 2.1687097549438477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3534909188747406, "step": 982 }, { "epoch": 0.01968, "grad_norm": 3.6875, "grad_norm_var": 0.0962554931640625, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.194393038749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34674669802188873, "step": 984 }, { "epoch": 0.01972, "grad_norm": 3.359375, "grad_norm_var": 0.09602762858072916, "learning_rate": 0.0001, "loss": 5.5513, "loss/crossentropy": 2.1355903148651123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3180558532476425, "step": 986 }, { "epoch": 0.01976, "grad_norm": 3.390625, "grad_norm_var": 0.08559468587239584, "learning_rate": 0.0001, "loss": 5.9431, "loss/crossentropy": 2.2688111066818237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.365144744515419, "step": 988 }, { "epoch": 0.0198, "grad_norm": 3.65625, "grad_norm_var": 0.06303609212239583, "learning_rate": 0.0001, "loss": 5.5117, "loss/crossentropy": 2.423216700553894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34281550347805023, "step": 990 }, { "epoch": 0.01984, "grad_norm": 3.375, "grad_norm_var": 0.0419586181640625, "learning_rate": 0.0001, "loss": 5.4698, "loss/crossentropy": 1.9360128045082092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3343205749988556, "step": 992 }, { "epoch": 0.01988, "grad_norm": 3.421875, "grad_norm_var": 0.04468994140625, "learning_rate": 0.0001, "loss": 5.8128, "loss/crossentropy": 2.181576132774353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35567884147167206, "step": 994 }, { "epoch": 0.01992, "grad_norm": 3.578125, "grad_norm_var": 0.037398274739583334, "learning_rate": 0.0001, "loss": 5.9295, "loss/crossentropy": 2.166663408279419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34105008840560913, "step": 996 }, { "epoch": 0.01996, "grad_norm": 3.203125, "grad_norm_var": 0.027408854166666666, "learning_rate": 0.0001, "loss": 5.5579, "loss/crossentropy": 2.285332202911377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35812389850616455, "step": 998 }, { "epoch": 0.02, "grad_norm": 4.0, "grad_norm_var": 35.396484375, "learning_rate": 0.0001, "loss": 6.3986, "loss/crossentropy": 2.088365077972412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3717931807041168, "step": 1000 }, { "epoch": 0.02004, "grad_norm": 4.25, "grad_norm_var": 35.223714192708336, "learning_rate": 0.0001, "loss": 6.1327, "loss/crossentropy": 2.4051828384399414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.420933797955513, "step": 1002 }, { "epoch": 0.02008, "grad_norm": 3.265625, "grad_norm_var": 35.27108968098958, "learning_rate": 0.0001, "loss": 5.6789, "loss/crossentropy": 2.3092572689056396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37571050226688385, "step": 1004 }, { "epoch": 0.02012, "grad_norm": 3.640625, "grad_norm_var": 35.29562072753906, "learning_rate": 0.0001, "loss": 5.6972, "loss/crossentropy": 2.147248387336731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29974566400051117, "step": 1006 }, { "epoch": 0.02016, "grad_norm": 3.984375, "grad_norm_var": 35.141299438476565, "learning_rate": 0.0001, "loss": 5.988, "loss/crossentropy": 2.3385868668556213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4254964739084244, "step": 1008 }, { "epoch": 0.0202, "grad_norm": 3.625, "grad_norm_var": 35.15125223795573, "learning_rate": 0.0001, "loss": 5.7553, "loss/crossentropy": 2.142681658267975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3764440715312958, "step": 1010 }, { "epoch": 0.02024, "grad_norm": 3.34375, "grad_norm_var": 35.18043619791667, "learning_rate": 0.0001, "loss": 5.5947, "loss/crossentropy": 2.241790771484375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3385400176048279, "step": 1012 }, { "epoch": 0.02028, "grad_norm": 3.859375, "grad_norm_var": 34.836360677083334, "learning_rate": 0.0001, "loss": 6.3228, "loss/crossentropy": 2.1563867330551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3954617381095886, "step": 1014 }, { "epoch": 0.02032, "grad_norm": 3.484375, "grad_norm_var": 0.09921468098958333, "learning_rate": 0.0001, "loss": 5.4613, "loss/crossentropy": 1.9462800025939941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32294341921806335, "step": 1016 }, { "epoch": 0.02036, "grad_norm": 3.53125, "grad_norm_var": 0.08026936848958334, "learning_rate": 0.0001, "loss": 5.4993, "loss/crossentropy": 1.83676278591156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3142092078924179, "step": 1018 }, { "epoch": 0.0204, "grad_norm": 3.84375, "grad_norm_var": 0.09038798014322917, "learning_rate": 0.0001, "loss": 5.8174, "loss/crossentropy": 1.951962649822235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3889008015394211, "step": 1020 }, { "epoch": 0.02044, "grad_norm": 3.578125, "grad_norm_var": 0.09976806640625, "learning_rate": 0.0001, "loss": 5.7173, "loss/crossentropy": 2.299771785736084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3214885741472244, "step": 1022 }, { "epoch": 0.02048, "grad_norm": 3.1875, "grad_norm_var": 0.09135640462239583, "learning_rate": 0.0001, "loss": 5.2291, "loss/crossentropy": 1.9117569327354431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32827669382095337, "step": 1024 }, { "epoch": 0.02052, "grad_norm": 3.1875, "grad_norm_var": 0.096533203125, "learning_rate": 0.0001, "loss": 5.7974, "loss/crossentropy": 2.484488010406494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34361426532268524, "step": 1026 }, { "epoch": 0.02056, "grad_norm": 3.59375, "grad_norm_var": 0.09973551432291666, "learning_rate": 0.0001, "loss": 5.7044, "loss/crossentropy": 2.3155311346054077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3734404444694519, "step": 1028 }, { "epoch": 0.0206, "grad_norm": 3.125, "grad_norm_var": 0.04983317057291667, "learning_rate": 0.0001, "loss": 5.4202, "loss/crossentropy": 2.081188380718231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3250262886285782, "step": 1030 }, { "epoch": 0.02064, "grad_norm": 3.09375, "grad_norm_var": 0.0574127197265625, "learning_rate": 0.0001, "loss": 5.5885, "loss/crossentropy": 2.044768512248993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34609031677246094, "step": 1032 }, { "epoch": 0.02068, "grad_norm": 4.0, "grad_norm_var": 0.0875640869140625, "learning_rate": 0.0001, "loss": 6.078, "loss/crossentropy": 2.0666560530662537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.365878626704216, "step": 1034 }, { "epoch": 0.02072, "grad_norm": 3.5625, "grad_norm_var": 0.08336588541666666, "learning_rate": 0.0001, "loss": 5.9891, "loss/crossentropy": 2.2933902740478516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3754771202802658, "step": 1036 }, { "epoch": 0.02076, "grad_norm": 3.28125, "grad_norm_var": 0.08640848795572917, "learning_rate": 0.0001, "loss": 5.8105, "loss/crossentropy": 2.28829288482666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3676798492670059, "step": 1038 }, { "epoch": 0.0208, "grad_norm": 3.5, "grad_norm_var": 0.08378499348958333, "learning_rate": 0.0001, "loss": 5.9305, "loss/crossentropy": 2.5891193151474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40094244480133057, "step": 1040 }, { "epoch": 0.02084, "grad_norm": 3.234375, "grad_norm_var": 0.08056233723958334, "learning_rate": 0.0001, "loss": 5.8579, "loss/crossentropy": 2.238967180252075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3226759731769562, "step": 1042 }, { "epoch": 0.02088, "grad_norm": 3.296875, "grad_norm_var": 0.07796223958333333, "learning_rate": 0.0001, "loss": 5.2959, "loss/crossentropy": 2.0116711258888245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3511478453874588, "step": 1044 }, { "epoch": 0.02092, "grad_norm": 3.515625, "grad_norm_var": 0.07344462076822916, "learning_rate": 0.0001, "loss": 5.4885, "loss/crossentropy": 2.4924051761627197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3375450670719147, "step": 1046 }, { "epoch": 0.02096, "grad_norm": 3.234375, "grad_norm_var": 0.059235636393229166, "learning_rate": 0.0001, "loss": 5.6838, "loss/crossentropy": 2.138728439807892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34410610795021057, "step": 1048 }, { "epoch": 0.021, "grad_norm": 3.515625, "grad_norm_var": 0.046122233072916664, "learning_rate": 0.0001, "loss": 5.7371, "loss/crossentropy": 2.3748635053634644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3580169975757599, "step": 1050 }, { "epoch": 0.02104, "grad_norm": 3.125, "grad_norm_var": 0.044310506184895834, "learning_rate": 0.0001, "loss": 5.427, "loss/crossentropy": 2.061552882194519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31069953739643097, "step": 1052 }, { "epoch": 0.02108, "grad_norm": 3.515625, "grad_norm_var": 0.03769124348958333, "learning_rate": 0.0001, "loss": 5.3469, "loss/crossentropy": 2.299555718898773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31592319905757904, "step": 1054 }, { "epoch": 0.02112, "grad_norm": 3.53125, "grad_norm_var": 0.04571940104166667, "learning_rate": 0.0001, "loss": 6.1254, "loss/crossentropy": 2.4866377115249634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37435297667980194, "step": 1056 }, { "epoch": 0.02116, "grad_norm": 3.203125, "grad_norm_var": 0.047684733072916666, "learning_rate": 0.0001, "loss": 5.5676, "loss/crossentropy": 1.8185054063796997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2784232199192047, "step": 1058 }, { "epoch": 0.0212, "grad_norm": 3.3125, "grad_norm_var": 0.04840087890625, "learning_rate": 0.0001, "loss": 5.6184, "loss/crossentropy": 2.215538501739502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3471361994743347, "step": 1060 }, { "epoch": 0.02124, "grad_norm": 3.078125, "grad_norm_var": 0.057616170247395834, "learning_rate": 0.0001, "loss": 5.7615, "loss/crossentropy": 2.6912894248962402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35911867022514343, "step": 1062 }, { "epoch": 0.02128, "grad_norm": 3.125, "grad_norm_var": 0.06670633951822917, "learning_rate": 0.0001, "loss": 5.3436, "loss/crossentropy": 1.9757090210914612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29868973791599274, "step": 1064 }, { "epoch": 0.02132, "grad_norm": 3.21875, "grad_norm_var": 0.052294921875, "learning_rate": 0.0001, "loss": 5.5334, "loss/crossentropy": 2.3396666049957275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3561312407255173, "step": 1066 }, { "epoch": 0.02136, "grad_norm": 3.359375, "grad_norm_var": 0.0477935791015625, "learning_rate": 0.0001, "loss": 5.7564, "loss/crossentropy": 2.3498982191085815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3842166066169739, "step": 1068 }, { "epoch": 0.0214, "grad_norm": 3.234375, "grad_norm_var": 0.03717041015625, "learning_rate": 0.0001, "loss": 5.8936, "loss/crossentropy": 2.037585139274597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3429017663002014, "step": 1070 }, { "epoch": 0.02144, "grad_norm": 3.484375, "grad_norm_var": 0.0216949462890625, "learning_rate": 0.0001, "loss": 5.6938, "loss/crossentropy": 2.4804376363754272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35338981449604034, "step": 1072 }, { "epoch": 0.02148, "grad_norm": 3.375, "grad_norm_var": 0.027448527018229165, "learning_rate": 0.0001, "loss": 5.8146, "loss/crossentropy": 2.5210201740264893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3769296407699585, "step": 1074 }, { "epoch": 0.02152, "grad_norm": 3.40625, "grad_norm_var": 0.03369852701822917, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.1258187294006348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3353133201599121, "step": 1076 }, { "epoch": 0.02156, "grad_norm": 3.328125, "grad_norm_var": 0.026676432291666666, "learning_rate": 0.0001, "loss": 5.5436, "loss/crossentropy": 2.2107938528060913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34667155146598816, "step": 1078 }, { "epoch": 0.0216, "grad_norm": 3.171875, "grad_norm_var": 0.0194244384765625, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 1.9614633321762085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2958581745624542, "step": 1080 }, { "epoch": 0.02164, "grad_norm": 3.515625, "grad_norm_var": 0.0194976806640625, "learning_rate": 0.0001, "loss": 5.8219, "loss/crossentropy": 2.1403249502182007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3220343589782715, "step": 1082 }, { "epoch": 0.02168, "grad_norm": 3.28125, "grad_norm_var": 0.022493489583333335, "learning_rate": 0.0001, "loss": 5.6037, "loss/crossentropy": 1.6533048152923584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814648747444153, "step": 1084 }, { "epoch": 0.02172, "grad_norm": 3.59375, "grad_norm_var": 0.0232086181640625, "learning_rate": 0.0001, "loss": 5.7731, "loss/crossentropy": 2.69880211353302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3835880607366562, "step": 1086 }, { "epoch": 0.02176, "grad_norm": 3.5, "grad_norm_var": 0.0240631103515625, "learning_rate": 0.0001, "loss": 5.8119, "loss/crossentropy": 2.214504837989807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31686101853847504, "step": 1088 }, { "epoch": 0.0218, "grad_norm": 3.765625, "grad_norm_var": 0.027164713541666666, "learning_rate": 0.0001, "loss": 6.0376, "loss/crossentropy": 2.2377456426620483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3682183176279068, "step": 1090 }, { "epoch": 0.02184, "grad_norm": 3.359375, "grad_norm_var": 0.027799479166666665, "learning_rate": 0.0001, "loss": 5.7387, "loss/crossentropy": 2.0977545976638794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32313986122608185, "step": 1092 }, { "epoch": 0.02188, "grad_norm": 3.125, "grad_norm_var": 0.0371490478515625, "learning_rate": 0.0001, "loss": 5.6407, "loss/crossentropy": 2.1717870235443115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3619600385427475, "step": 1094 }, { "epoch": 0.02192, "grad_norm": 3.546875, "grad_norm_var": 0.03674723307291667, "learning_rate": 0.0001, "loss": 5.554, "loss/crossentropy": 2.2805471420288086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3328556418418884, "step": 1096 }, { "epoch": 0.02196, "grad_norm": 3.71875, "grad_norm_var": 0.04625244140625, "learning_rate": 0.0001, "loss": 5.8009, "loss/crossentropy": 2.034846782684326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3610256612300873, "step": 1098 }, { "epoch": 0.022, "grad_norm": 3.359375, "grad_norm_var": 0.04612528483072917, "learning_rate": 0.0001, "loss": 5.6121, "loss/crossentropy": 2.0208348631858826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3311958611011505, "step": 1100 }, { "epoch": 0.02204, "grad_norm": 3.59375, "grad_norm_var": 0.04737040201822917, "learning_rate": 0.0001, "loss": 5.7135, "loss/crossentropy": 2.1020554900169373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3638540059328079, "step": 1102 }, { "epoch": 0.02208, "grad_norm": 3.46875, "grad_norm_var": 0.046019490559895834, "learning_rate": 0.0001, "loss": 5.5099, "loss/crossentropy": 2.28346848487854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3434429168701172, "step": 1104 }, { "epoch": 0.02212, "grad_norm": 3.21875, "grad_norm_var": 0.1152740478515625, "learning_rate": 0.0001, "loss": 5.6866, "loss/crossentropy": 2.103623867034912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34000229835510254, "step": 1106 }, { "epoch": 0.02216, "grad_norm": 4.1875, "grad_norm_var": 0.14846089680989583, "learning_rate": 0.0001, "loss": 5.5196, "loss/crossentropy": 2.205894947052002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3432965874671936, "step": 1108 }, { "epoch": 0.0222, "grad_norm": 3.109375, "grad_norm_var": 0.16467183430989582, "learning_rate": 0.0001, "loss": 5.6166, "loss/crossentropy": 2.395035982131958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3834904432296753, "step": 1110 }, { "epoch": 0.02224, "grad_norm": 3.296875, "grad_norm_var": 0.16505533854166668, "learning_rate": 0.0001, "loss": 5.9042, "loss/crossentropy": 2.3755353689193726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33256760239601135, "step": 1112 }, { "epoch": 0.02228, "grad_norm": 3.28125, "grad_norm_var": 0.16402079264322916, "learning_rate": 0.0001, "loss": 6.0135, "loss/crossentropy": 2.6754449605941772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3524845540523529, "step": 1114 }, { "epoch": 0.02232, "grad_norm": 3.515625, "grad_norm_var": 0.15730692545572916, "learning_rate": 0.0001, "loss": 5.6448, "loss/crossentropy": 2.2398552894592285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.343609020113945, "step": 1116 }, { "epoch": 0.02236, "grad_norm": 3.3125, "grad_norm_var": 0.16770833333333332, "learning_rate": 0.0001, "loss": 5.3075, "loss/crossentropy": 2.399322271347046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34514427185058594, "step": 1118 }, { "epoch": 0.0224, "grad_norm": 3.3125, "grad_norm_var": 0.17229410807291667, "learning_rate": 0.0001, "loss": 5.8886, "loss/crossentropy": 2.4002050161361694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35784730315208435, "step": 1120 }, { "epoch": 0.02244, "grad_norm": 3.21875, "grad_norm_var": 0.10064188639322917, "learning_rate": 0.0001, "loss": 5.6239, "loss/crossentropy": 2.293683171272278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37052902579307556, "step": 1122 }, { "epoch": 0.02248, "grad_norm": 3.234375, "grad_norm_var": 0.0532379150390625, "learning_rate": 0.0001, "loss": 5.8244, "loss/crossentropy": 2.2177391052246094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3482564836740494, "step": 1124 }, { "epoch": 0.02252, "grad_norm": 4.1875, "grad_norm_var": 0.0637603759765625, "learning_rate": 0.0001, "loss": 5.8959, "loss/crossentropy": 2.288211703300476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3401540517807007, "step": 1126 }, { "epoch": 0.02256, "grad_norm": 3.1875, "grad_norm_var": 0.0933258056640625, "learning_rate": 0.0001, "loss": 5.5054, "loss/crossentropy": 2.1786144971847534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.331302285194397, "step": 1128 }, { "epoch": 0.0226, "grad_norm": 3.5, "grad_norm_var": 0.09163004557291667, "learning_rate": 0.0001, "loss": 5.7604, "loss/crossentropy": 2.0390175580978394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32953669130802155, "step": 1130 }, { "epoch": 0.02264, "grad_norm": 3.5, "grad_norm_var": 0.0921295166015625, "learning_rate": 0.0001, "loss": 5.6952, "loss/crossentropy": 2.188543677330017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3585694134235382, "step": 1132 }, { "epoch": 0.02268, "grad_norm": 3.296875, "grad_norm_var": 0.08284098307291667, "learning_rate": 0.0001, "loss": 5.732, "loss/crossentropy": 2.0731694102287292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3121813088655472, "step": 1134 }, { "epoch": 0.02272, "grad_norm": 3.390625, "grad_norm_var": 0.0794097900390625, "learning_rate": 0.0001, "loss": 5.6306, "loss/crossentropy": 2.144552707672119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3303966820240021, "step": 1136 }, { "epoch": 0.02276, "grad_norm": 3.890625, "grad_norm_var": 0.09900614420572916, "learning_rate": 0.0001, "loss": 5.3794, "loss/crossentropy": 2.1617428064346313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3420899361371994, "step": 1138 }, { "epoch": 0.0228, "grad_norm": 3.109375, "grad_norm_var": 0.12813212076822916, "learning_rate": 0.0001, "loss": 5.2834, "loss/crossentropy": 1.8098865747451782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28376901149749756, "step": 1140 }, { "epoch": 0.02284, "grad_norm": 2.984375, "grad_norm_var": 0.09795633951822917, "learning_rate": 0.0001, "loss": 5.3911, "loss/crossentropy": 2.133797824382782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3587050139904022, "step": 1142 }, { "epoch": 0.02288, "grad_norm": 4.96875, "grad_norm_var": 0.24221903483072918, "learning_rate": 0.0001, "loss": 5.8787, "loss/crossentropy": 2.378090739250183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4244185537099838, "step": 1144 }, { "epoch": 0.02292, "grad_norm": 3.03125, "grad_norm_var": 0.2736887613932292, "learning_rate": 0.0001, "loss": 5.6692, "loss/crossentropy": 2.4442414045333862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3569464683532715, "step": 1146 }, { "epoch": 0.02296, "grad_norm": 3.671875, "grad_norm_var": 0.29189453125, "learning_rate": 0.0001, "loss": 5.5468, "loss/crossentropy": 2.6446973085403442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36859095096588135, "step": 1148 }, { "epoch": 0.023, "grad_norm": 4.4375, "grad_norm_var": 0.3506988525390625, "learning_rate": 0.0001, "loss": 5.9727, "loss/crossentropy": 2.4100207090377808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4082309305667877, "step": 1150 }, { "epoch": 0.02304, "grad_norm": 3.25, "grad_norm_var": 0.3552317301432292, "learning_rate": 0.0001, "loss": 5.3537, "loss/crossentropy": 2.1472485661506653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3299275189638138, "step": 1152 }, { "epoch": 0.02308, "grad_norm": 3.21875, "grad_norm_var": 0.3376373291015625, "learning_rate": 0.0001, "loss": 5.4393, "loss/crossentropy": 2.1891872882843018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.356732040643692, "step": 1154 }, { "epoch": 0.02312, "grad_norm": 3.71875, "grad_norm_var": 0.30201416015625, "learning_rate": 0.0001, "loss": 6.0049, "loss/crossentropy": 2.1432933807373047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.360423281788826, "step": 1156 }, { "epoch": 0.02316, "grad_norm": 3.1875, "grad_norm_var": 0.2923248291015625, "learning_rate": 0.0001, "loss": 5.5541, "loss/crossentropy": 2.176819324493408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.340317040681839, "step": 1158 }, { "epoch": 0.0232, "grad_norm": 2.96875, "grad_norm_var": 0.17407938639322917, "learning_rate": 0.0001, "loss": 5.2312, "loss/crossentropy": 2.325207471847534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3309635668992996, "step": 1160 }, { "epoch": 0.02324, "grad_norm": 3.515625, "grad_norm_var": 0.14537353515625, "learning_rate": 0.0001, "loss": 5.7322, "loss/crossentropy": 2.2536743879318237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31422293186187744, "step": 1162 }, { "epoch": 0.02328, "grad_norm": 3.328125, "grad_norm_var": 0.12657877604166667, "learning_rate": 0.0001, "loss": 5.4354, "loss/crossentropy": 2.180476427078247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2956361174583435, "step": 1164 }, { "epoch": 0.02332, "grad_norm": 3.15625, "grad_norm_var": 0.0527008056640625, "learning_rate": 0.0001, "loss": 5.6067, "loss/crossentropy": 1.995088815689087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2936599552631378, "step": 1166 }, { "epoch": 0.02336, "grad_norm": 3.1875, "grad_norm_var": 0.05378316243489583, "learning_rate": 0.0001, "loss": 5.7304, "loss/crossentropy": 2.2555994987487793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3630402684211731, "step": 1168 }, { "epoch": 0.0234, "grad_norm": 3.296875, "grad_norm_var": 0.05856119791666667, "learning_rate": 0.0001, "loss": 5.1613, "loss/crossentropy": 2.0441415905952454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2960120141506195, "step": 1170 }, { "epoch": 0.02344, "grad_norm": 3.25, "grad_norm_var": 0.0204498291015625, "learning_rate": 0.0001, "loss": 5.2238, "loss/crossentropy": 1.8090497255325317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29750876128673553, "step": 1172 }, { "epoch": 0.02348, "grad_norm": 3.234375, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 5.2506, "loss/crossentropy": 2.2780312299728394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3376633822917938, "step": 1174 }, { "epoch": 0.02352, "grad_norm": 3.4375, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 5.53, "loss/crossentropy": 1.8047232627868652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29132279753685, "step": 1176 }, { "epoch": 0.02356, "grad_norm": 3.265625, "grad_norm_var": 0.020417277018229166, "learning_rate": 0.0001, "loss": 5.4149, "loss/crossentropy": 2.203469753265381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3124798536300659, "step": 1178 }, { "epoch": 0.0236, "grad_norm": 3.34375, "grad_norm_var": 0.019775390625, "learning_rate": 0.0001, "loss": 5.4481, "loss/crossentropy": 2.078102231025696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.331977054476738, "step": 1180 }, { "epoch": 0.02364, "grad_norm": 3.765625, "grad_norm_var": 0.0368316650390625, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.2701854705810547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3202142268419266, "step": 1182 }, { "epoch": 0.02368, "grad_norm": 3.671875, "grad_norm_var": 0.047240193684895834, "learning_rate": 0.0001, "loss": 5.6414, "loss/crossentropy": 1.930423617362976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3192301094532013, "step": 1184 }, { "epoch": 0.02372, "grad_norm": 3.546875, "grad_norm_var": 0.06542867024739583, "learning_rate": 0.0001, "loss": 6.0688, "loss/crossentropy": 2.291581869125366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3759836256504059, "step": 1186 }, { "epoch": 0.02376, "grad_norm": 3.375, "grad_norm_var": 0.06529541015625, "learning_rate": 0.0001, "loss": 5.6689, "loss/crossentropy": 1.9887789487838745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31285202503204346, "step": 1188 }, { "epoch": 0.0238, "grad_norm": 3.109375, "grad_norm_var": 0.054671223958333334, "learning_rate": 0.0001, "loss": 5.7457, "loss/crossentropy": 2.290129065513611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3461494445800781, "step": 1190 }, { "epoch": 0.02384, "grad_norm": 3.6875, "grad_norm_var": 0.0604888916015625, "learning_rate": 0.0001, "loss": 6.288, "loss/crossentropy": 2.3252567052841187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3817393332719803, "step": 1192 }, { "epoch": 0.02388, "grad_norm": 3.21875, "grad_norm_var": 0.06230367024739583, "learning_rate": 0.0001, "loss": 5.4498, "loss/crossentropy": 1.736217737197876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28928153216838837, "step": 1194 }, { "epoch": 0.02392, "grad_norm": 3.09375, "grad_norm_var": 0.06886393229166667, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.112093210220337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3196643739938736, "step": 1196 }, { "epoch": 0.02396, "grad_norm": 3.15625, "grad_norm_var": 0.07258707682291667, "learning_rate": 0.0001, "loss": 5.3552, "loss/crossentropy": 2.3334370851516724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3539857119321823, "step": 1198 }, { "epoch": 0.024, "grad_norm": 3.484375, "grad_norm_var": 0.09695536295572917, "learning_rate": 0.0001, "loss": 5.3293, "loss/crossentropy": 1.7393567562103271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3211805671453476, "step": 1200 }, { "epoch": 0.02404, "grad_norm": 3.4375, "grad_norm_var": 0.07935791015625, "learning_rate": 0.0001, "loss": 5.6292, "loss/crossentropy": 2.314788579940796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37599092721939087, "step": 1202 }, { "epoch": 0.02408, "grad_norm": 3.1875, "grad_norm_var": 0.076318359375, "learning_rate": 0.0001, "loss": 5.435, "loss/crossentropy": 2.2820088863372803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33955833315849304, "step": 1204 }, { "epoch": 0.02412, "grad_norm": 3.46875, "grad_norm_var": 0.07183837890625, "learning_rate": 0.0001, "loss": 5.87, "loss/crossentropy": 2.012476146221161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3048281967639923, "step": 1206 }, { "epoch": 0.02416, "grad_norm": 3.1875, "grad_norm_var": 0.07356669108072916, "learning_rate": 0.0001, "loss": 5.6751, "loss/crossentropy": 2.315858483314514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32628118991851807, "step": 1208 }, { "epoch": 0.0242, "grad_norm": 3.359375, "grad_norm_var": 0.14773763020833333, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.3260581493377686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.352965384721756, "step": 1210 }, { "epoch": 0.02424, "grad_norm": 4.0625, "grad_norm_var": 0.1748687744140625, "learning_rate": 0.0001, "loss": 5.689, "loss/crossentropy": 2.199007749557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3089260905981064, "step": 1212 }, { "epoch": 0.02428, "grad_norm": 3.703125, "grad_norm_var": 1.4942047119140625, "learning_rate": 0.0001, "loss": 5.798, "loss/crossentropy": 2.30281138420105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3629491478204727, "step": 1214 }, { "epoch": 0.02432, "grad_norm": 3.1875, "grad_norm_var": 1.5125640869140624, "learning_rate": 0.0001, "loss": 5.4927, "loss/crossentropy": 2.238295316696167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3436104357242584, "step": 1216 }, { "epoch": 0.02436, "grad_norm": 3.9375, "grad_norm_var": 1.501488240559896, "learning_rate": 0.0001, "loss": 5.5763, "loss/crossentropy": 2.52456271648407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40382860600948334, "step": 1218 }, { "epoch": 0.0244, "grad_norm": 3.46875, "grad_norm_var": 1.47197265625, "learning_rate": 0.0001, "loss": 5.8796, "loss/crossentropy": 2.4516665935516357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.378818541765213, "step": 1220 }, { "epoch": 0.02444, "grad_norm": 3.046875, "grad_norm_var": 1.485480753580729, "learning_rate": 0.0001, "loss": 5.5438, "loss/crossentropy": 2.593857169151306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3743816912174225, "step": 1222 }, { "epoch": 0.02448, "grad_norm": 3.375, "grad_norm_var": 1.4396799723307292, "learning_rate": 0.0001, "loss": 5.2807, "loss/crossentropy": 1.873874843120575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2743247449398041, "step": 1224 }, { "epoch": 0.02452, "grad_norm": 3.265625, "grad_norm_var": 1.4642862955729166, "learning_rate": 0.0001, "loss": 5.6455, "loss/crossentropy": 2.1173152923583984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.350399985909462, "step": 1226 }, { "epoch": 0.02456, "grad_norm": 3.5, "grad_norm_var": 1.4721018473307292, "learning_rate": 0.0001, "loss": 5.7318, "loss/crossentropy": 2.3650271892547607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3241463750600815, "step": 1228 }, { "epoch": 0.0246, "grad_norm": 3.1875, "grad_norm_var": 0.08748270670572916, "learning_rate": 0.0001, "loss": 5.5229, "loss/crossentropy": 2.180622935295105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31293927133083344, "step": 1230 }, { "epoch": 0.02464, "grad_norm": 3.546875, "grad_norm_var": 0.08329976399739583, "learning_rate": 0.0001, "loss": 5.7443, "loss/crossentropy": 2.230265259742737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3405953049659729, "step": 1232 }, { "epoch": 0.02468, "grad_norm": 3.6875, "grad_norm_var": 0.07164713541666666, "learning_rate": 0.0001, "loss": 5.5518, "loss/crossentropy": 2.050285518169403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32737791538238525, "step": 1234 }, { "epoch": 0.02472, "grad_norm": 3.140625, "grad_norm_var": 0.07604166666666666, "learning_rate": 0.0001, "loss": 5.4412, "loss/crossentropy": 2.2357693910598755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34366659820079803, "step": 1236 }, { "epoch": 0.02476, "grad_norm": 3.09375, "grad_norm_var": 0.09081624348958334, "learning_rate": 0.0001, "loss": 5.6066, "loss/crossentropy": 2.308778762817383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33825138211250305, "step": 1238 }, { "epoch": 0.0248, "grad_norm": 3.390625, "grad_norm_var": 0.0979888916015625, "learning_rate": 0.0001, "loss": 5.419, "loss/crossentropy": 2.016503393650055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30135589838027954, "step": 1240 }, { "epoch": 0.02484, "grad_norm": 3.359375, "grad_norm_var": 0.0870025634765625, "learning_rate": 0.0001, "loss": 5.2494, "loss/crossentropy": 1.8450073599815369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34388674795627594, "step": 1242 }, { "epoch": 0.02488, "grad_norm": 3.515625, "grad_norm_var": 0.05263671875, "learning_rate": 0.0001, "loss": 5.7092, "loss/crossentropy": 2.486730694770813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35796378552913666, "step": 1244 }, { "epoch": 0.02492, "grad_norm": 2.859375, "grad_norm_var": 0.0688385009765625, "learning_rate": 0.0001, "loss": 5.2565, "loss/crossentropy": 2.010675370693207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31364670395851135, "step": 1246 }, { "epoch": 0.02496, "grad_norm": 3.15625, "grad_norm_var": 0.07418619791666667, "learning_rate": 0.0001, "loss": 5.366, "loss/crossentropy": 2.128747880458832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3106560483574867, "step": 1248 }, { "epoch": 0.025, "grad_norm": 3.359375, "grad_norm_var": 0.06523335774739583, "learning_rate": 0.0001, "loss": 5.8005, "loss/crossentropy": 2.4563735723495483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33769866824150085, "step": 1250 }, { "epoch": 0.02504, "grad_norm": 3.15625, "grad_norm_var": 0.06453450520833333, "learning_rate": 0.0001, "loss": 5.372, "loss/crossentropy": 2.3785592317581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36167748272418976, "step": 1252 }, { "epoch": 0.02508, "grad_norm": 3.21875, "grad_norm_var": 0.029069010416666666, "learning_rate": 0.0001, "loss": 5.5506, "loss/crossentropy": 2.2642308473587036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3315223157405853, "step": 1254 }, { "epoch": 0.02512, "grad_norm": 3.46875, "grad_norm_var": 0.028416951497395832, "learning_rate": 0.0001, "loss": 5.4608, "loss/crossentropy": 2.2246991395950317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33083613216876984, "step": 1256 }, { "epoch": 0.02516, "grad_norm": 3.1875, "grad_norm_var": 0.0290435791015625, "learning_rate": 0.0001, "loss": 5.762, "loss/crossentropy": 2.129785656929016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32672610878944397, "step": 1258 }, { "epoch": 0.0252, "grad_norm": 3.125, "grad_norm_var": 0.0243804931640625, "learning_rate": 0.0001, "loss": 5.7297, "loss/crossentropy": 2.0835453271865845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3504233658313751, "step": 1260 }, { "epoch": 0.02524, "grad_norm": 3.703125, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 5.6707, "loss/crossentropy": 2.443893313407898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3636191487312317, "step": 1262 }, { "epoch": 0.02528, "grad_norm": 3.21875, "grad_norm_var": 0.02197265625, "learning_rate": 0.0001, "loss": 5.6107, "loss/crossentropy": 2.367433190345764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35734108090400696, "step": 1264 }, { "epoch": 0.02532, "grad_norm": 3.1875, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 5.049, "loss/crossentropy": 1.8102024793624878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28676700592041016, "step": 1266 }, { "epoch": 0.02536, "grad_norm": 3.0, "grad_norm_var": 0.03738606770833333, "learning_rate": 0.0001, "loss": 5.3916, "loss/crossentropy": 2.54524827003479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3380637466907501, "step": 1268 }, { "epoch": 0.0254, "grad_norm": 2.984375, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 5.6667, "loss/crossentropy": 2.4164276123046875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36949749290943146, "step": 1270 }, { "epoch": 0.02544, "grad_norm": 3.515625, "grad_norm_var": 0.043745930989583334, "learning_rate": 0.0001, "loss": 5.4235, "loss/crossentropy": 2.4335602521896362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3608807325363159, "step": 1272 }, { "epoch": 0.02548, "grad_norm": 3.1875, "grad_norm_var": 0.0422760009765625, "learning_rate": 0.0001, "loss": 5.5858, "loss/crossentropy": 2.2711308002471924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3631722033023834, "step": 1274 }, { "epoch": 0.02552, "grad_norm": 3.140625, "grad_norm_var": 0.0430816650390625, "learning_rate": 0.0001, "loss": 5.181, "loss/crossentropy": 2.378043293952942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35628968477249146, "step": 1276 }, { "epoch": 0.02556, "grad_norm": 3.1875, "grad_norm_var": 0.02431640625, "learning_rate": 0.0001, "loss": 5.5721, "loss/crossentropy": 1.8950039148330688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3105602264404297, "step": 1278 }, { "epoch": 0.0256, "grad_norm": 2.96875, "grad_norm_var": 0.026659138997395835, "learning_rate": 0.0001, "loss": 5.4649, "loss/crossentropy": 1.8309656977653503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27458132803440094, "step": 1280 }, { "epoch": 0.02564, "grad_norm": 3.40625, "grad_norm_var": 0.034195963541666666, "learning_rate": 0.0001, "loss": 5.993, "loss/crossentropy": 2.3949296474456787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37693680822849274, "step": 1282 }, { "epoch": 0.02568, "grad_norm": 3.140625, "grad_norm_var": 0.026725260416666667, "learning_rate": 0.0001, "loss": 5.6157, "loss/crossentropy": 2.497879147529602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36076007783412933, "step": 1284 }, { "epoch": 0.02572, "grad_norm": 2.953125, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 5.354, "loss/crossentropy": 2.108432352542877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3115152269601822, "step": 1286 }, { "epoch": 0.02576, "grad_norm": 3.171875, "grad_norm_var": 0.0201568603515625, "learning_rate": 0.0001, "loss": 5.5424, "loss/crossentropy": 2.079313635826111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31575673818588257, "step": 1288 }, { "epoch": 0.0258, "grad_norm": 3.15625, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 5.2594, "loss/crossentropy": 2.3390332460403442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3293873071670532, "step": 1290 }, { "epoch": 0.02584, "grad_norm": 3.15625, "grad_norm_var": 0.019612630208333332, "learning_rate": 0.0001, "loss": 5.2895, "loss/crossentropy": 2.180980920791626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30600421130657196, "step": 1292 }, { "epoch": 0.02588, "grad_norm": 2.9375, "grad_norm_var": 0.022858683268229166, "learning_rate": 0.0001, "loss": 5.2784, "loss/crossentropy": 2.0647836327552795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3114248663187027, "step": 1294 }, { "epoch": 0.02592, "grad_norm": 2.84375, "grad_norm_var": 0.02958984375, "learning_rate": 0.0001, "loss": 5.5063, "loss/crossentropy": 1.931971788406372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32757866382598877, "step": 1296 }, { "epoch": 0.02596, "grad_norm": 3.09375, "grad_norm_var": 0.0202301025390625, "learning_rate": 0.0001, "loss": 5.6389, "loss/crossentropy": 2.1180718541145325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3272206783294678, "step": 1298 }, { "epoch": 0.026, "grad_norm": 3.4375, "grad_norm_var": 0.024616495768229166, "learning_rate": 0.0001, "loss": 5.5069, "loss/crossentropy": 1.8535473346710205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3100028932094574, "step": 1300 }, { "epoch": 0.02604, "grad_norm": 4.5, "grad_norm_var": 0.16031901041666666, "learning_rate": 0.0001, "loss": 5.574, "loss/crossentropy": 1.9625197052955627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3074956685304642, "step": 1302 }, { "epoch": 0.02608, "grad_norm": 3.25, "grad_norm_var": 0.1673736572265625, "learning_rate": 0.0001, "loss": 5.3764, "loss/crossentropy": 2.248521149158478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33484284579753876, "step": 1304 }, { "epoch": 0.02612, "grad_norm": 2.921875, "grad_norm_var": 0.1750640869140625, "learning_rate": 0.0001, "loss": 5.8318, "loss/crossentropy": 2.6374051570892334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3604440838098526, "step": 1306 }, { "epoch": 0.02616, "grad_norm": 4.0625, "grad_norm_var": 0.22333984375, "learning_rate": 0.0001, "loss": 5.5151, "loss/crossentropy": 2.3213003873825073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3659953773021698, "step": 1308 }, { "epoch": 0.0262, "grad_norm": 3.734375, "grad_norm_var": 0.21988525390625, "learning_rate": 0.0001, "loss": 5.9334, "loss/crossentropy": 2.3527311086654663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3638792932033539, "step": 1310 }, { "epoch": 0.02624, "grad_norm": 3.140625, "grad_norm_var": 0.20551656087239584, "learning_rate": 0.0001, "loss": 5.4442, "loss/crossentropy": 1.6319801807403564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29070258140563965, "step": 1312 }, { "epoch": 0.02628, "grad_norm": 3.125, "grad_norm_var": 0.215966796875, "learning_rate": 0.0001, "loss": 5.1864, "loss/crossentropy": 1.986265480518341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30515219271183014, "step": 1314 }, { "epoch": 0.02632, "grad_norm": 2.828125, "grad_norm_var": 0.23336181640625, "learning_rate": 0.0001, "loss": 5.5808, "loss/crossentropy": 2.237283766269684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32876546680927277, "step": 1316 }, { "epoch": 0.02636, "grad_norm": 3.859375, "grad_norm_var": 0.13362223307291668, "learning_rate": 0.0001, "loss": 5.4868, "loss/crossentropy": 2.215467691421509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34449851512908936, "step": 1318 }, { "epoch": 0.0264, "grad_norm": 3.03125, "grad_norm_var": 0.12567952473958333, "learning_rate": 0.0001, "loss": 5.7427, "loss/crossentropy": 2.264952063560486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.387825608253479, "step": 1320 }, { "epoch": 0.02644, "grad_norm": 3.125, "grad_norm_var": 0.12009989420572917, "learning_rate": 0.0001, "loss": 5.5614, "loss/crossentropy": 2.0678945779800415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3228907287120819, "step": 1322 }, { "epoch": 0.02648, "grad_norm": 4.40625, "grad_norm_var": 0.212841796875, "learning_rate": 0.0001, "loss": 5.6259, "loss/crossentropy": 2.1414765119552612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31144315004348755, "step": 1324 }, { "epoch": 0.02652, "grad_norm": 4.03125, "grad_norm_var": 0.23813374837239584, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.3898890018463135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3721010088920593, "step": 1326 }, { "epoch": 0.02656, "grad_norm": 3.421875, "grad_norm_var": 0.23483784993489584, "learning_rate": 0.0001, "loss": 5.5417, "loss/crossentropy": 2.404030442237854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36486634612083435, "step": 1328 }, { "epoch": 0.0266, "grad_norm": 3.15625, "grad_norm_var": 0.22004801432291668, "learning_rate": 0.0001, "loss": 5.7535, "loss/crossentropy": 2.1436617970466614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091920465230942, "step": 1330 }, { "epoch": 0.02664, "grad_norm": 3.1875, "grad_norm_var": 0.2074615478515625, "learning_rate": 0.0001, "loss": 5.2832, "loss/crossentropy": 2.106055796146393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3344803601503372, "step": 1332 }, { "epoch": 0.02668, "grad_norm": 3.640625, "grad_norm_var": 0.19583333333333333, "learning_rate": 0.0001, "loss": 6.0091, "loss/crossentropy": 2.465435266494751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34129244089126587, "step": 1334 }, { "epoch": 0.02672, "grad_norm": 3.21875, "grad_norm_var": 0.18728841145833333, "learning_rate": 0.0001, "loss": 5.5569, "loss/crossentropy": 1.9109253883361816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30202071368694305, "step": 1336 }, { "epoch": 0.02676, "grad_norm": 3.0, "grad_norm_var": 0.1890045166015625, "learning_rate": 0.0001, "loss": 5.7093, "loss/crossentropy": 2.267784833908081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32984335720539093, "step": 1338 }, { "epoch": 0.0268, "grad_norm": 3.0625, "grad_norm_var": 0.07266337076822917, "learning_rate": 0.0001, "loss": 5.4432, "loss/crossentropy": 2.6131194829940796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3797858655452728, "step": 1340 }, { "epoch": 0.02684, "grad_norm": 3.171875, "grad_norm_var": 0.031891886393229166, "learning_rate": 0.0001, "loss": 5.7456, "loss/crossentropy": 2.214663505554199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.340796560049057, "step": 1342 }, { "epoch": 0.02688, "grad_norm": 3.0, "grad_norm_var": 0.03245340983072917, "learning_rate": 0.0001, "loss": 5.4205, "loss/crossentropy": 2.0236783027648926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3101559728384018, "step": 1344 }, { "epoch": 0.02692, "grad_norm": 3.078125, "grad_norm_var": 0.030565388997395835, "learning_rate": 0.0001, "loss": 5.2671, "loss/crossentropy": 2.3260135650634766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.339004784822464, "step": 1346 }, { "epoch": 0.02696, "grad_norm": 2.96875, "grad_norm_var": 0.0317047119140625, "learning_rate": 0.0001, "loss": 5.6529, "loss/crossentropy": 2.177807927131653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3171197772026062, "step": 1348 }, { "epoch": 0.027, "grad_norm": 3.375, "grad_norm_var": 0.0202056884765625, "learning_rate": 0.0001, "loss": 5.4682, "loss/crossentropy": 2.351730227470398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34749266505241394, "step": 1350 }, { "epoch": 0.02704, "grad_norm": 2.953125, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 5.3692, "loss/crossentropy": 2.2959564924240112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3270048499107361, "step": 1352 }, { "epoch": 0.02708, "grad_norm": 3.03125, "grad_norm_var": 0.016890462239583334, "learning_rate": 0.0001, "loss": 5.5243, "loss/crossentropy": 2.399070382118225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32505376636981964, "step": 1354 }, { "epoch": 0.02712, "grad_norm": 3.046875, "grad_norm_var": 0.01812744140625, "learning_rate": 0.0001, "loss": 5.3306, "loss/crossentropy": 1.9084061980247498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3073619455099106, "step": 1356 }, { "epoch": 0.02716, "grad_norm": 3.078125, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 5.3942, "loss/crossentropy": 2.1204254627227783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2885961979627609, "step": 1358 }, { "epoch": 0.0272, "grad_norm": 3.28125, "grad_norm_var": 0.014842732747395834, "learning_rate": 0.0001, "loss": 5.7313, "loss/crossentropy": 2.0167239904403687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3026747703552246, "step": 1360 }, { "epoch": 0.02724, "grad_norm": 3.171875, "grad_norm_var": 0.014742024739583333, "learning_rate": 0.0001, "loss": 5.3987, "loss/crossentropy": 1.9588357210159302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2976878881454468, "step": 1362 }, { "epoch": 0.02728, "grad_norm": 2.796875, "grad_norm_var": 0.019449869791666668, "learning_rate": 0.0001, "loss": 5.1658, "loss/crossentropy": 1.8169561624526978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2874959260225296, "step": 1364 }, { "epoch": 0.02732, "grad_norm": 3.1875, "grad_norm_var": 0.014354451497395834, "learning_rate": 0.0001, "loss": 5.5128, "loss/crossentropy": 2.258527398109436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34180621802806854, "step": 1366 }, { "epoch": 0.02736, "grad_norm": 3.09375, "grad_norm_var": 0.017284138997395834, "learning_rate": 0.0001, "loss": 5.4374, "loss/crossentropy": 2.1959571838378906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3124794214963913, "step": 1368 }, { "epoch": 0.0274, "grad_norm": 2.875, "grad_norm_var": 0.0198150634765625, "learning_rate": 0.0001, "loss": 5.2299, "loss/crossentropy": 2.1830934286117554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3260872811079025, "step": 1370 }, { "epoch": 0.02744, "grad_norm": 3.203125, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 5.6831, "loss/crossentropy": 2.411653518676758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3575899302959442, "step": 1372 }, { "epoch": 0.02748, "grad_norm": 3.140625, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 5.3919, "loss/crossentropy": 2.1585222482681274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30734311044216156, "step": 1374 }, { "epoch": 0.02752, "grad_norm": 2.90625, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 5.37, "loss/crossentropy": 2.3621217012405396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3373589664697647, "step": 1376 }, { "epoch": 0.02756, "grad_norm": 3.109375, "grad_norm_var": 0.023160807291666665, "learning_rate": 0.0001, "loss": 5.4712, "loss/crossentropy": 2.1203317046165466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28425413370132446, "step": 1378 }, { "epoch": 0.0276, "grad_norm": 3.296875, "grad_norm_var": 0.0223297119140625, "learning_rate": 0.0001, "loss": 5.5438, "loss/crossentropy": 2.2568705081939697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.296497106552124, "step": 1380 }, { "epoch": 0.02764, "grad_norm": 3.5625, "grad_norm_var": 0.03665364583333333, "learning_rate": 0.0001, "loss": 5.5613, "loss/crossentropy": 2.260026216506958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34534430503845215, "step": 1382 }, { "epoch": 0.02768, "grad_norm": 5.5625, "grad_norm_var": 0.4100494384765625, "learning_rate": 0.0001, "loss": 5.6217, "loss/crossentropy": 1.9400787949562073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32853779196739197, "step": 1384 }, { "epoch": 0.02772, "grad_norm": 3.390625, "grad_norm_var": 0.39205322265625, "learning_rate": 0.0001, "loss": 5.2009, "loss/crossentropy": 2.168904423713684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3421178460121155, "step": 1386 }, { "epoch": 0.02776, "grad_norm": 2.90625, "grad_norm_var": 0.405419921875, "learning_rate": 0.0001, "loss": 5.3992, "loss/crossentropy": 2.3474777936935425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31982940435409546, "step": 1388 }, { "epoch": 0.0278, "grad_norm": 3.140625, "grad_norm_var": 0.40615234375, "learning_rate": 0.0001, "loss": 5.5791, "loss/crossentropy": 2.3416868448257446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3400905281305313, "step": 1390 }, { "epoch": 0.02784, "grad_norm": 2.8125, "grad_norm_var": 0.4003570556640625, "learning_rate": 0.0001, "loss": 5.4413, "loss/crossentropy": 2.299672842025757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3143058717250824, "step": 1392 }, { "epoch": 0.02788, "grad_norm": 3.15625, "grad_norm_var": 0.3945058186848958, "learning_rate": 0.0001, "loss": 5.6231, "loss/crossentropy": 2.3058812618255615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33790935575962067, "step": 1394 }, { "epoch": 0.02792, "grad_norm": 3.03125, "grad_norm_var": 0.3947987874348958, "learning_rate": 0.0001, "loss": 5.324, "loss/crossentropy": 2.2137999534606934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33156970143318176, "step": 1396 }, { "epoch": 0.02796, "grad_norm": 4.1875, "grad_norm_var": 0.4427571614583333, "learning_rate": 0.0001, "loss": 5.4567, "loss/crossentropy": 2.04184353351593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32263143360614777, "step": 1398 }, { "epoch": 0.028, "grad_norm": 3.265625, "grad_norm_var": 0.09589436848958334, "learning_rate": 0.0001, "loss": 5.339, "loss/crossentropy": 2.0377472639083862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3136487454175949, "step": 1400 }, { "epoch": 0.02804, "grad_norm": 3.171875, "grad_norm_var": 0.09228413899739583, "learning_rate": 0.0001, "loss": 5.5838, "loss/crossentropy": 2.5366055965423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3476262539625168, "step": 1402 }, { "epoch": 0.02808, "grad_norm": 3.1875, "grad_norm_var": 0.09063212076822917, "learning_rate": 0.0001, "loss": 5.2447, "loss/crossentropy": 2.088012456893921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31111815571784973, "step": 1404 }, { "epoch": 0.02812, "grad_norm": 3.53125, "grad_norm_var": 0.10530598958333333, "learning_rate": 0.0001, "loss": 5.7316, "loss/crossentropy": 2.1750329732894897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3138856291770935, "step": 1406 }, { "epoch": 0.02816, "grad_norm": 3.625, "grad_norm_var": 0.09811909993489583, "learning_rate": 0.0001, "loss": 5.4708, "loss/crossentropy": 1.977031648159027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3179774433374405, "step": 1408 }, { "epoch": 0.0282, "grad_norm": 3.296875, "grad_norm_var": 0.12561848958333333, "learning_rate": 0.0001, "loss": 5.3815, "loss/crossentropy": 2.0594210028648376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32200056314468384, "step": 1410 }, { "epoch": 0.02824, "grad_norm": 3.59375, "grad_norm_var": 0.11237691243489584, "learning_rate": 0.0001, "loss": 5.443, "loss/crossentropy": 2.3887473344802856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3303599953651428, "step": 1412 }, { "epoch": 0.02828, "grad_norm": 3.453125, "grad_norm_var": 0.06500244140625, "learning_rate": 0.0001, "loss": 5.5507, "loss/crossentropy": 2.188898801803589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3378629684448242, "step": 1414 }, { "epoch": 0.02832, "grad_norm": 3.171875, "grad_norm_var": 0.06982014973958334, "learning_rate": 0.0001, "loss": 5.458, "loss/crossentropy": 1.981561303138733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814937084913254, "step": 1416 }, { "epoch": 0.02836, "grad_norm": 3.1875, "grad_norm_var": 0.0756011962890625, "learning_rate": 0.0001, "loss": 5.271, "loss/crossentropy": 2.2141716480255127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3429889380931854, "step": 1418 }, { "epoch": 0.0284, "grad_norm": 3.1875, "grad_norm_var": 0.0818267822265625, "learning_rate": 0.0001, "loss": 5.048, "loss/crossentropy": 2.0720977783203125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30601558089256287, "step": 1420 }, { "epoch": 0.02844, "grad_norm": 3.21875, "grad_norm_var": 0.07434488932291666, "learning_rate": 0.0001, "loss": 5.6407, "loss/crossentropy": 2.0516344904899597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3182393014431, "step": 1422 }, { "epoch": 0.02848, "grad_norm": 3.28125, "grad_norm_var": 0.070166015625, "learning_rate": 0.0001, "loss": 5.6532, "loss/crossentropy": 2.161284327507019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3383013904094696, "step": 1424 }, { "epoch": 0.02852, "grad_norm": 3.1875, "grad_norm_var": 0.03163960774739583, "learning_rate": 0.0001, "loss": 5.1594, "loss/crossentropy": 1.9955796599388123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3169983923435211, "step": 1426 }, { "epoch": 0.02856, "grad_norm": 2.984375, "grad_norm_var": 0.023421223958333334, "learning_rate": 0.0001, "loss": 5.3951, "loss/crossentropy": 2.1046979427337646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30594320595264435, "step": 1428 }, { "epoch": 0.0286, "grad_norm": 2.90625, "grad_norm_var": 0.020231119791666665, "learning_rate": 0.0001, "loss": 5.364, "loss/crossentropy": 1.9611601829528809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3039780706167221, "step": 1430 }, { "epoch": 0.02864, "grad_norm": 2.9375, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 5.8823, "loss/crossentropy": 2.256209373474121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3315662145614624, "step": 1432 }, { "epoch": 0.02868, "grad_norm": 2.9375, "grad_norm_var": 0.030594889322916666, "learning_rate": 0.0001, "loss": 5.2821, "loss/crossentropy": 2.150561034679413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33046241104602814, "step": 1434 }, { "epoch": 0.02872, "grad_norm": 3.078125, "grad_norm_var": 0.024898274739583334, "learning_rate": 0.0001, "loss": 5.3609, "loss/crossentropy": 2.279554605484009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3621693551540375, "step": 1436 }, { "epoch": 0.02876, "grad_norm": 3.015625, "grad_norm_var": 0.0259918212890625, "learning_rate": 0.0001, "loss": 5.4323, "loss/crossentropy": 2.0183790922164917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33361808955669403, "step": 1438 }, { "epoch": 0.0288, "grad_norm": 4.15625, "grad_norm_var": 0.7352203369140625, "learning_rate": 0.0001, "loss": 5.4781, "loss/crossentropy": 1.964252531528473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31638333201408386, "step": 1440 }, { "epoch": 0.02884, "grad_norm": 3.046875, "grad_norm_var": 0.73902587890625, "learning_rate": 0.0001, "loss": 5.6907, "loss/crossentropy": 1.9271634817123413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3408525586128235, "step": 1442 }, { "epoch": 0.02888, "grad_norm": 2.953125, "grad_norm_var": 0.7411417643229167, "learning_rate": 0.0001, "loss": 5.4869, "loss/crossentropy": 2.4400887489318848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33455249667167664, "step": 1444 }, { "epoch": 0.02892, "grad_norm": 3.078125, "grad_norm_var": 0.7270904541015625, "learning_rate": 0.0001, "loss": 5.6179, "loss/crossentropy": 2.465711832046509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3528379648923874, "step": 1446 }, { "epoch": 0.02896, "grad_norm": 3.328125, "grad_norm_var": 0.7147288004557292, "learning_rate": 0.0001, "loss": 5.5125, "loss/crossentropy": 2.269619941711426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32898105680942535, "step": 1448 }, { "epoch": 0.029, "grad_norm": 2.984375, "grad_norm_var": 0.68287353515625, "learning_rate": 0.0001, "loss": 5.5775, "loss/crossentropy": 2.0418076515197754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3170912265777588, "step": 1450 }, { "epoch": 0.02904, "grad_norm": 3.15625, "grad_norm_var": 0.6841756184895833, "learning_rate": 0.0001, "loss": 5.5964, "loss/crossentropy": 2.291188359260559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35694004595279694, "step": 1452 }, { "epoch": 0.02908, "grad_norm": 2.75, "grad_norm_var": 0.7253163655598959, "learning_rate": 0.0001, "loss": 5.1608, "loss/crossentropy": 2.497802972793579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3271156847476959, "step": 1454 }, { "epoch": 0.02912, "grad_norm": 2.984375, "grad_norm_var": 0.0334625244140625, "learning_rate": 0.0001, "loss": 5.1527, "loss/crossentropy": 2.161794900894165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32509492337703705, "step": 1456 }, { "epoch": 0.02916, "grad_norm": 3.3125, "grad_norm_var": 0.03470052083333333, "learning_rate": 0.0001, "loss": 5.7939, "loss/crossentropy": 2.632015347480774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36461199820041656, "step": 1458 }, { "epoch": 0.0292, "grad_norm": 3.03125, "grad_norm_var": 0.03372395833333333, "learning_rate": 0.0001, "loss": 5.6233, "loss/crossentropy": 2.2478749752044678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3311367332935333, "step": 1460 }, { "epoch": 0.02924, "grad_norm": 4.03125, "grad_norm_var": 0.09263916015625, "learning_rate": 0.0001, "loss": 5.2528, "loss/crossentropy": 1.8451723456382751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31802159547805786, "step": 1462 }, { "epoch": 0.02928, "grad_norm": 3.71875, "grad_norm_var": 0.11772359212239583, "learning_rate": 0.0001, "loss": 5.6469, "loss/crossentropy": 2.6684207916259766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35483625531196594, "step": 1464 }, { "epoch": 0.02932, "grad_norm": 2.921875, "grad_norm_var": 0.11543680826822916, "learning_rate": 0.0001, "loss": 5.2866, "loss/crossentropy": 2.497614622116089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3422156721353531, "step": 1466 }, { "epoch": 0.02936, "grad_norm": 4.25, "grad_norm_var": 0.19032796223958334, "learning_rate": 0.0001, "loss": 5.7424, "loss/crossentropy": 2.750740170478821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.377558171749115, "step": 1468 }, { "epoch": 0.0294, "grad_norm": 2.8125, "grad_norm_var": 0.17550455729166667, "learning_rate": 0.0001, "loss": 5.1741, "loss/crossentropy": 1.9961607456207275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3166217654943466, "step": 1470 }, { "epoch": 0.02944, "grad_norm": 3.359375, "grad_norm_var": 0.1655914306640625, "learning_rate": 0.0001, "loss": 5.3114, "loss/crossentropy": 2.0874768495559692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2912164479494095, "step": 1472 }, { "epoch": 0.02948, "grad_norm": 2.953125, "grad_norm_var": 0.174072265625, "learning_rate": 0.0001, "loss": 5.2102, "loss/crossentropy": 2.112182080745697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29954925179481506, "step": 1474 }, { "epoch": 0.02952, "grad_norm": 3.6875, "grad_norm_var": 0.4074371337890625, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.1319644451141357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3002544492483139, "step": 1476 }, { "epoch": 0.02956, "grad_norm": 3.0625, "grad_norm_var": 0.37743733723958334, "learning_rate": 0.0001, "loss": 5.8305, "loss/crossentropy": 2.3029643297195435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3262677788734436, "step": 1478 }, { "epoch": 0.0296, "grad_norm": 3.15625, "grad_norm_var": 0.3826243082682292, "learning_rate": 0.0001, "loss": 5.6803, "loss/crossentropy": 2.8598941564559937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.38816460967063904, "step": 1480 }, { "epoch": 0.02964, "grad_norm": 2.984375, "grad_norm_var": 0.37844950358072915, "learning_rate": 0.0001, "loss": 5.2177, "loss/crossentropy": 2.134063720703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3128468096256256, "step": 1482 }, { "epoch": 0.02968, "grad_norm": 3.125, "grad_norm_var": 0.31961263020833336, "learning_rate": 0.0001, "loss": 5.5234, "loss/crossentropy": 2.481287717819214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3209179639816284, "step": 1484 }, { "epoch": 0.02972, "grad_norm": 3.3125, "grad_norm_var": 0.30549723307291665, "learning_rate": 0.0001, "loss": 5.3571, "loss/crossentropy": 2.0571895837783813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3121480941772461, "step": 1486 }, { "epoch": 0.02976, "grad_norm": 3.203125, "grad_norm_var": 0.30614827473958334, "learning_rate": 0.0001, "loss": 5.2725, "loss/crossentropy": 2.1930073499679565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3031492233276367, "step": 1488 }, { "epoch": 0.0298, "grad_norm": 3.046875, "grad_norm_var": 0.30426025390625, "learning_rate": 0.0001, "loss": 5.3256, "loss/crossentropy": 2.403902530670166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32206277549266815, "step": 1490 }, { "epoch": 0.02984, "grad_norm": 2.9375, "grad_norm_var": 0.0538482666015625, "learning_rate": 0.0001, "loss": 5.2859, "loss/crossentropy": 2.131115198135376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31651052832603455, "step": 1492 }, { "epoch": 0.02988, "grad_norm": 2.828125, "grad_norm_var": 0.05826416015625, "learning_rate": 0.0001, "loss": 5.1232, "loss/crossentropy": 2.024750769138336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3142934888601303, "step": 1494 }, { "epoch": 0.02992, "grad_norm": 2.984375, "grad_norm_var": 0.06279195149739583, "learning_rate": 0.0001, "loss": 5.3804, "loss/crossentropy": 1.9606707692146301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28610387444496155, "step": 1496 }, { "epoch": 0.02996, "grad_norm": 3.15625, "grad_norm_var": 0.062474568684895836, "learning_rate": 0.0001, "loss": 5.5731, "loss/crossentropy": 2.31516432762146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32875190675258636, "step": 1498 }, { "epoch": 0.03, "grad_norm": 3.359375, "grad_norm_var": 0.0711822509765625, "learning_rate": 0.0001, "loss": 5.7626, "loss/crossentropy": 2.295942783355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3379499167203903, "step": 1500 }, { "epoch": 0.03004, "grad_norm": 3.21875, "grad_norm_var": 0.06961263020833333, "learning_rate": 0.0001, "loss": 5.5997, "loss/crossentropy": 2.1859577894210815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32143887877464294, "step": 1502 }, { "epoch": 0.03008, "grad_norm": 3.28125, "grad_norm_var": 0.07023824055989583, "learning_rate": 0.0001, "loss": 5.4159, "loss/crossentropy": 2.1852502822875977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3582807630300522, "step": 1504 }, { "epoch": 0.03012, "grad_norm": 2.75, "grad_norm_var": 0.07810770670572917, "learning_rate": 0.0001, "loss": 5.3011, "loss/crossentropy": 2.174897611141205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33067959547042847, "step": 1506 }, { "epoch": 0.03016, "grad_norm": 2.96875, "grad_norm_var": 0.04057515462239583, "learning_rate": 0.0001, "loss": 5.2775, "loss/crossentropy": 2.24343740940094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34563779830932617, "step": 1508 }, { "epoch": 0.0302, "grad_norm": 2.90625, "grad_norm_var": 0.043473307291666666, "learning_rate": 0.0001, "loss": 5.317, "loss/crossentropy": 1.9828822612762451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2887475937604904, "step": 1510 }, { "epoch": 0.03024, "grad_norm": 3.0, "grad_norm_var": 0.03655192057291667, "learning_rate": 0.0001, "loss": 5.3172, "loss/crossentropy": 2.2110393047332764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32290786504745483, "step": 1512 }, { "epoch": 0.03028, "grad_norm": 2.90625, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 5.2598, "loss/crossentropy": 2.3797603845596313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3262799382209778, "step": 1514 }, { "epoch": 0.03032, "grad_norm": 3.1875, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 5.4107, "loss/crossentropy": 2.0183085799217224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2915680408477783, "step": 1516 }, { "epoch": 0.03036, "grad_norm": 3.78125, "grad_norm_var": 0.9888631184895833, "learning_rate": 0.0001, "loss": 5.4632, "loss/crossentropy": 1.875212013721466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30001460015773773, "step": 1518 }, { "epoch": 0.0304, "grad_norm": 3.0625, "grad_norm_var": 0.9916300455729167, "learning_rate": 0.0001, "loss": 5.4406, "loss/crossentropy": 2.1000564098358154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.354188472032547, "step": 1520 }, { "epoch": 0.03044, "grad_norm": 2.84375, "grad_norm_var": 0.980126953125, "learning_rate": 0.0001, "loss": 5.4837, "loss/crossentropy": 2.071643114089966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.333335280418396, "step": 1522 }, { "epoch": 0.03048, "grad_norm": 3.203125, "grad_norm_var": 0.9749664306640625, "learning_rate": 0.0001, "loss": 5.2716, "loss/crossentropy": 2.4253947734832764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31365686655044556, "step": 1524 }, { "epoch": 0.03052, "grad_norm": 3.0625, "grad_norm_var": 0.9605133056640625, "learning_rate": 0.0001, "loss": 5.1601, "loss/crossentropy": 1.967090904712677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30677899718284607, "step": 1526 }, { "epoch": 0.03056, "grad_norm": 3.15625, "grad_norm_var": 0.954443359375, "learning_rate": 0.0001, "loss": 5.0971, "loss/crossentropy": 2.112701952457428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3029911667108536, "step": 1528 }, { "epoch": 0.0306, "grad_norm": 4.5625, "grad_norm_var": 1.0084269205729166, "learning_rate": 0.0001, "loss": 5.6836, "loss/crossentropy": 2.5657063722610474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3484763503074646, "step": 1530 }, { "epoch": 0.03064, "grad_norm": 3.234375, "grad_norm_var": 1.110497029622396, "learning_rate": 0.0001, "loss": 5.3888, "loss/crossentropy": 2.1214585304260254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3243858218193054, "step": 1532 }, { "epoch": 0.03068, "grad_norm": 3.234375, "grad_norm_var": 0.35461832682291666, "learning_rate": 0.0001, "loss": 5.4662, "loss/crossentropy": 2.427902936935425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3428474962711334, "step": 1534 }, { "epoch": 0.03072, "grad_norm": 3.078125, "grad_norm_var": 0.36741434733072914, "learning_rate": 0.0001, "loss": 5.2956, "loss/crossentropy": 1.975690484046936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3136949688196182, "step": 1536 }, { "epoch": 0.03076, "grad_norm": 2.734375, "grad_norm_var": 0.38342692057291666, "learning_rate": 0.0001, "loss": 5.1233, "loss/crossentropy": 2.295845150947571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31703390181064606, "step": 1538 }, { "epoch": 0.0308, "grad_norm": 3.171875, "grad_norm_var": 0.37892252604166665, "learning_rate": 0.0001, "loss": 5.8286, "loss/crossentropy": 2.117497444152832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32601243257522583, "step": 1540 }, { "epoch": 0.03084, "grad_norm": 3.140625, "grad_norm_var": 0.49339192708333335, "learning_rate": 0.0001, "loss": 5.4065, "loss/crossentropy": 2.3824862241744995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3332519829273224, "step": 1542 }, { "epoch": 0.03088, "grad_norm": 3.140625, "grad_norm_var": 0.4940582275390625, "learning_rate": 0.0001, "loss": 5.0672, "loss/crossentropy": 1.9037857055664062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3143462985754013, "step": 1544 }, { "epoch": 0.03092, "grad_norm": 3.09375, "grad_norm_var": 0.4064737955729167, "learning_rate": 0.0001, "loss": 5.2328, "loss/crossentropy": 1.9191133379936218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29443541169166565, "step": 1546 }, { "epoch": 0.03096, "grad_norm": 2.734375, "grad_norm_var": 0.28172200520833335, "learning_rate": 0.0001, "loss": 5.3402, "loss/crossentropy": 2.216462254524231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31411711871623993, "step": 1548 }, { "epoch": 0.031, "grad_norm": 2.96875, "grad_norm_var": 0.227685546875, "learning_rate": 0.0001, "loss": 5.0455, "loss/crossentropy": 1.8064754605293274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667318135499954, "step": 1550 }, { "epoch": 0.03104, "grad_norm": 3.453125, "grad_norm_var": 0.46340738932291664, "learning_rate": 0.0001, "loss": 5.5672, "loss/crossentropy": 2.488176465034485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3422655761241913, "step": 1552 }, { "epoch": 0.03108, "grad_norm": 2.859375, "grad_norm_var": 0.4529205322265625, "learning_rate": 0.0001, "loss": 5.4404, "loss/crossentropy": 2.5670164823532104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3462950587272644, "step": 1554 }, { "epoch": 0.03112, "grad_norm": 3.25, "grad_norm_var": 0.46499735514322915, "learning_rate": 0.0001, "loss": 5.0295, "loss/crossentropy": 2.0630581378936768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29241877794265747, "step": 1556 }, { "epoch": 0.03116, "grad_norm": 2.84375, "grad_norm_var": 0.30891011555989584, "learning_rate": 0.0001, "loss": 5.4751, "loss/crossentropy": 2.685954213142395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37143297493457794, "step": 1558 }, { "epoch": 0.0312, "grad_norm": 3.421875, "grad_norm_var": 0.311669921875, "learning_rate": 0.0001, "loss": 5.6995, "loss/crossentropy": 1.9786988496780396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40276341140270233, "step": 1560 }, { "epoch": 0.03124, "grad_norm": 3.0, "grad_norm_var": 0.31383056640625, "learning_rate": 0.0001, "loss": 5.6695, "loss/crossentropy": 2.1484411358833313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3264364004135132, "step": 1562 }, { "epoch": 0.03128, "grad_norm": 2.875, "grad_norm_var": 0.3103424072265625, "learning_rate": 0.0001, "loss": 5.3015, "loss/crossentropy": 2.1411852836608887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3156583160161972, "step": 1564 }, { "epoch": 0.03132, "grad_norm": 2.90625, "grad_norm_var": 0.31302083333333336, "learning_rate": 0.0001, "loss": 5.2911, "loss/crossentropy": 2.1509006023406982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3066476732492447, "step": 1566 }, { "epoch": 0.03136, "grad_norm": 2.90625, "grad_norm_var": 0.0416412353515625, "learning_rate": 0.0001, "loss": 5.1904, "loss/crossentropy": 1.7540676593780518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705047130584717, "step": 1568 }, { "epoch": 0.0314, "grad_norm": 3.046875, "grad_norm_var": 0.0431640625, "learning_rate": 0.0001, "loss": 4.9637, "loss/crossentropy": 2.2091184854507446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2920738309621811, "step": 1570 }, { "epoch": 0.03144, "grad_norm": 2.984375, "grad_norm_var": 0.045735677083333336, "learning_rate": 0.0001, "loss": 5.6048, "loss/crossentropy": 1.8405091762542725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981649935245514, "step": 1572 }, { "epoch": 0.03148, "grad_norm": 2.6875, "grad_norm_var": 0.0420806884765625, "learning_rate": 0.0001, "loss": 5.4258, "loss/crossentropy": 2.2292457818984985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30654460191726685, "step": 1574 }, { "epoch": 0.03152, "grad_norm": 4.21875, "grad_norm_var": 0.12868550618489583, "learning_rate": 0.0001, "loss": 5.8439, "loss/crossentropy": 2.653907895088196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35303865373134613, "step": 1576 }, { "epoch": 0.03156, "grad_norm": 3.234375, "grad_norm_var": 0.13888346354166667, "learning_rate": 0.0001, "loss": 5.6145, "loss/crossentropy": 2.6799376010894775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37383997440338135, "step": 1578 }, { "epoch": 0.0316, "grad_norm": 2.828125, "grad_norm_var": 0.14431864420572918, "learning_rate": 0.0001, "loss": 5.3321, "loss/crossentropy": 2.3438535928726196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30706922709941864, "step": 1580 }, { "epoch": 0.03164, "grad_norm": 2.96875, "grad_norm_var": 0.14254150390625, "learning_rate": 0.0001, "loss": 5.2139, "loss/crossentropy": 2.1885964274406433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30707718431949615, "step": 1582 }, { "epoch": 0.03168, "grad_norm": 2.984375, "grad_norm_var": 0.14042561848958332, "learning_rate": 0.0001, "loss": 5.1661, "loss/crossentropy": 2.0471584796905518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31723180413246155, "step": 1584 }, { "epoch": 0.03172, "grad_norm": 3.0, "grad_norm_var": 0.1363433837890625, "learning_rate": 0.0001, "loss": 5.227, "loss/crossentropy": 2.135176420211792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29839280247688293, "step": 1586 }, { "epoch": 0.03176, "grad_norm": 3.3125, "grad_norm_var": 0.13482666015625, "learning_rate": 0.0001, "loss": 5.6412, "loss/crossentropy": 2.4444775581359863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33574284613132477, "step": 1588 }, { "epoch": 0.0318, "grad_norm": 3.140625, "grad_norm_var": 0.12195536295572916, "learning_rate": 0.0001, "loss": 5.4289, "loss/crossentropy": 2.1725653409957886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.310588076710701, "step": 1590 }, { "epoch": 0.03184, "grad_norm": 2.875, "grad_norm_var": 0.04572652180989583, "learning_rate": 0.0001, "loss": 5.3727, "loss/crossentropy": 2.3610929250717163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32158929109573364, "step": 1592 }, { "epoch": 0.03188, "grad_norm": 2.84375, "grad_norm_var": 0.0502105712890625, "learning_rate": 0.0001, "loss": 4.8794, "loss/crossentropy": 1.9271156787872314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28283432126045227, "step": 1594 }, { "epoch": 0.03192, "grad_norm": 3.203125, "grad_norm_var": 0.04372456868489583, "learning_rate": 0.0001, "loss": 5.3912, "loss/crossentropy": 2.4196890592575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3463610112667084, "step": 1596 }, { "epoch": 0.03196, "grad_norm": 2.71875, "grad_norm_var": 0.0500152587890625, "learning_rate": 0.0001, "loss": 5.1524, "loss/crossentropy": 2.207236647605896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33829018473625183, "step": 1598 }, { "epoch": 0.032, "grad_norm": 3.0625, "grad_norm_var": 0.04101155598958333, "learning_rate": 0.0001, "loss": 5.4724, "loss/crossentropy": 2.3757678270339966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3342677056789398, "step": 1600 }, { "epoch": 0.03204, "grad_norm": 2.921875, "grad_norm_var": 0.04512430826822917, "learning_rate": 0.0001, "loss": 5.1763, "loss/crossentropy": 2.2605016231536865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3062159866094589, "step": 1602 }, { "epoch": 0.03208, "grad_norm": 3.359375, "grad_norm_var": 0.04794921875, "learning_rate": 0.0001, "loss": 5.7139, "loss/crossentropy": 2.4536768198013306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36668023467063904, "step": 1604 }, { "epoch": 0.03212, "grad_norm": 3.09375, "grad_norm_var": 0.0523834228515625, "learning_rate": 0.0001, "loss": 5.137, "loss/crossentropy": 1.9870036244392395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2974477708339691, "step": 1606 }, { "epoch": 0.03216, "grad_norm": 3.015625, "grad_norm_var": 0.039876302083333336, "learning_rate": 0.0001, "loss": 5.3926, "loss/crossentropy": 2.1852506399154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3327452540397644, "step": 1608 }, { "epoch": 0.0322, "grad_norm": 3.140625, "grad_norm_var": 0.03128255208333333, "learning_rate": 0.0001, "loss": 5.4964, "loss/crossentropy": 2.2226197719573975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3110218793153763, "step": 1610 }, { "epoch": 0.03224, "grad_norm": 3.09375, "grad_norm_var": 0.029523722330729165, "learning_rate": 0.0001, "loss": 5.398, "loss/crossentropy": 1.8255922198295593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28216174244880676, "step": 1612 }, { "epoch": 0.03228, "grad_norm": 2.96875, "grad_norm_var": 0.023200480143229167, "learning_rate": 0.0001, "loss": 5.2777, "loss/crossentropy": 1.9663920998573303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3095496743917465, "step": 1614 }, { "epoch": 0.03232, "grad_norm": 2.953125, "grad_norm_var": 0.023908487955729165, "learning_rate": 0.0001, "loss": 5.1578, "loss/crossentropy": 2.2089942693710327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3307983875274658, "step": 1616 }, { "epoch": 0.03236, "grad_norm": 3.03125, "grad_norm_var": 0.01793212890625, "learning_rate": 0.0001, "loss": 5.3331, "loss/crossentropy": 2.261039137840271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.314799427986145, "step": 1618 }, { "epoch": 0.0324, "grad_norm": 2.859375, "grad_norm_var": 0.015555826822916667, "learning_rate": 0.0001, "loss": 5.1674, "loss/crossentropy": 2.350824236869812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3515756279230118, "step": 1620 }, { "epoch": 0.03244, "grad_norm": 2.9375, "grad_norm_var": 0.010749308268229167, "learning_rate": 0.0001, "loss": 5.4853, "loss/crossentropy": 2.2964736223220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3059935122728348, "step": 1622 }, { "epoch": 0.03248, "grad_norm": 2.859375, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 5.2503, "loss/crossentropy": 2.36459481716156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33763329684734344, "step": 1624 }, { "epoch": 0.03252, "grad_norm": 2.984375, "grad_norm_var": 0.0126617431640625, "learning_rate": 0.0001, "loss": 5.2242, "loss/crossentropy": 2.165920853614807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31318019330501556, "step": 1626 }, { "epoch": 0.03256, "grad_norm": 2.859375, "grad_norm_var": 0.010628255208333333, "learning_rate": 0.0001, "loss": 5.2544, "loss/crossentropy": 2.326790690422058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3044355511665344, "step": 1628 }, { "epoch": 0.0326, "grad_norm": 2.96875, "grad_norm_var": 0.01129150390625, "learning_rate": 0.0001, "loss": 5.3369, "loss/crossentropy": 1.8848688006401062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.306812584400177, "step": 1630 }, { "epoch": 0.03264, "grad_norm": 2.953125, "grad_norm_var": 0.0134185791015625, "learning_rate": 0.0001, "loss": 5.351, "loss/crossentropy": 2.045863091945648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091462701559067, "step": 1632 }, { "epoch": 0.03268, "grad_norm": 2.78125, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 4.9742, "loss/crossentropy": 2.0707273483276367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3038959503173828, "step": 1634 }, { "epoch": 0.03272, "grad_norm": 2.859375, "grad_norm_var": 0.011237589518229167, "learning_rate": 0.0001, "loss": 5.2076, "loss/crossentropy": 2.0786932706832886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29513464868068695, "step": 1636 }, { "epoch": 0.03276, "grad_norm": 3.203125, "grad_norm_var": 0.01900634765625, "learning_rate": 0.0001, "loss": 5.4237, "loss/crossentropy": 2.1527108550071716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32102274894714355, "step": 1638 }, { "epoch": 0.0328, "grad_norm": 3.15625, "grad_norm_var": 0.021361287434895834, "learning_rate": 0.0001, "loss": 5.126, "loss/crossentropy": 2.0383081436157227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3253529220819473, "step": 1640 }, { "epoch": 0.03284, "grad_norm": 2.734375, "grad_norm_var": 0.021809895833333332, "learning_rate": 0.0001, "loss": 5.0775, "loss/crossentropy": 2.2801902294158936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32200203835964203, "step": 1642 }, { "epoch": 0.03288, "grad_norm": 2.953125, "grad_norm_var": 0.022184244791666665, "learning_rate": 0.0001, "loss": 5.3035, "loss/crossentropy": 2.164717435836792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2997436225414276, "step": 1644 }, { "epoch": 0.03292, "grad_norm": 3.1875, "grad_norm_var": 0.025191243489583334, "learning_rate": 0.0001, "loss": 5.5166, "loss/crossentropy": 2.389414072036743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32998231053352356, "step": 1646 }, { "epoch": 0.03296, "grad_norm": 2.921875, "grad_norm_var": 0.023949178059895833, "learning_rate": 0.0001, "loss": 5.3225, "loss/crossentropy": 2.09418523311615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33829881250858307, "step": 1648 }, { "epoch": 0.033, "grad_norm": 3.109375, "grad_norm_var": 0.021800740559895834, "learning_rate": 0.0001, "loss": 5.261, "loss/crossentropy": 2.324030637741089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30337512493133545, "step": 1650 }, { "epoch": 0.03304, "grad_norm": 3.109375, "grad_norm_var": 0.020992024739583334, "learning_rate": 0.0001, "loss": 5.4162, "loss/crossentropy": 1.8635556101799011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27077023684978485, "step": 1652 }, { "epoch": 0.03308, "grad_norm": 2.8125, "grad_norm_var": 0.0215972900390625, "learning_rate": 0.0001, "loss": 5.2371, "loss/crossentropy": 2.1776190996170044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30546560883522034, "step": 1654 }, { "epoch": 0.03312, "grad_norm": 3.1875, "grad_norm_var": 0.021240234375, "learning_rate": 0.0001, "loss": 5.1721, "loss/crossentropy": 2.1003682613372803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3304767310619354, "step": 1656 }, { "epoch": 0.03316, "grad_norm": 3.203125, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 5.8029, "loss/crossentropy": 2.4331823587417603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3653264045715332, "step": 1658 }, { "epoch": 0.0332, "grad_norm": 3.125, "grad_norm_var": 0.1084381103515625, "learning_rate": 0.0001, "loss": 5.7157, "loss/crossentropy": 1.9939777851104736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28948865830898285, "step": 1660 }, { "epoch": 0.03324, "grad_norm": 2.9375, "grad_norm_var": 0.10896809895833333, "learning_rate": 0.0001, "loss": 4.933, "loss/crossentropy": 1.9093859791755676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3007328063249588, "step": 1662 }, { "epoch": 0.03328, "grad_norm": 2.734375, "grad_norm_var": 0.11998697916666666, "learning_rate": 0.0001, "loss": 5.1861, "loss/crossentropy": 2.2847355604171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2975463569164276, "step": 1664 }, { "epoch": 0.03332, "grad_norm": 3.046875, "grad_norm_var": 0.120166015625, "learning_rate": 0.0001, "loss": 5.3623, "loss/crossentropy": 2.0081310868263245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28596948087215424, "step": 1666 }, { "epoch": 0.03336, "grad_norm": 3.125, "grad_norm_var": 0.11974283854166666, "learning_rate": 0.0001, "loss": 5.2338, "loss/crossentropy": 2.189584493637085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30838510394096375, "step": 1668 }, { "epoch": 0.0334, "grad_norm": 2.796875, "grad_norm_var": 0.12802327473958333, "learning_rate": 0.0001, "loss": 5.2749, "loss/crossentropy": 2.204169988632202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091500401496887, "step": 1670 }, { "epoch": 0.03344, "grad_norm": 2.75, "grad_norm_var": 0.1338775634765625, "learning_rate": 0.0001, "loss": 5.2755, "loss/crossentropy": 2.195966899394989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30603383481502533, "step": 1672 }, { "epoch": 0.03348, "grad_norm": 3.0, "grad_norm_var": 0.1251953125, "learning_rate": 0.0001, "loss": 5.8768, "loss/crossentropy": 2.5402153730392456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3657945841550827, "step": 1674 }, { "epoch": 0.03352, "grad_norm": 3.09375, "grad_norm_var": 0.0277252197265625, "learning_rate": 0.0001, "loss": 5.5387, "loss/crossentropy": 2.1721729040145874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29539716243743896, "step": 1676 }, { "epoch": 0.03356, "grad_norm": 2.953125, "grad_norm_var": 0.027253214518229166, "learning_rate": 0.0001, "loss": 5.2355, "loss/crossentropy": 2.0591543912887573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29050062596797943, "step": 1678 }, { "epoch": 0.0336, "grad_norm": 2.765625, "grad_norm_var": 0.025846354166666665, "learning_rate": 0.0001, "loss": 5.4895, "loss/crossentropy": 2.1396639347076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3261077404022217, "step": 1680 }, { "epoch": 0.03364, "grad_norm": 3.015625, "grad_norm_var": 0.024689737955729166, "learning_rate": 0.0001, "loss": 5.2094, "loss/crossentropy": 1.9553123712539673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3204822689294815, "step": 1682 }, { "epoch": 0.03368, "grad_norm": 2.78125, "grad_norm_var": 0.023746744791666666, "learning_rate": 0.0001, "loss": 5.3417, "loss/crossentropy": 2.4139883518218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32602658867836, "step": 1684 }, { "epoch": 0.03372, "grad_norm": 2.90625, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 5.3479, "loss/crossentropy": 1.9849293231964111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32610756158828735, "step": 1686 }, { "epoch": 0.03376, "grad_norm": 2.953125, "grad_norm_var": 0.008622233072916667, "learning_rate": 0.0001, "loss": 5.6218, "loss/crossentropy": 2.698970675468445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3749641329050064, "step": 1688 }, { "epoch": 0.0338, "grad_norm": 3.0625, "grad_norm_var": 0.01285400390625, "learning_rate": 0.0001, "loss": 5.7617, "loss/crossentropy": 2.715620517730713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35715436935424805, "step": 1690 }, { "epoch": 0.03384, "grad_norm": 3.0, "grad_norm_var": 0.01201171875, "learning_rate": 0.0001, "loss": 5.5073, "loss/crossentropy": 2.7213666439056396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34585659205913544, "step": 1692 }, { "epoch": 0.03388, "grad_norm": 2.84375, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 5.2674, "loss/crossentropy": 2.277606725692749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3140410780906677, "step": 1694 }, { "epoch": 0.03392, "grad_norm": 3.765625, "grad_norm_var": 0.05364176432291667, "learning_rate": 0.0001, "loss": 5.2633, "loss/crossentropy": 2.332197904586792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30997559428215027, "step": 1696 }, { "epoch": 0.03396, "grad_norm": 2.953125, "grad_norm_var": 0.05464579264322917, "learning_rate": 0.0001, "loss": 5.4323, "loss/crossentropy": 2.4413230419158936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3075388967990875, "step": 1698 }, { "epoch": 0.034, "grad_norm": 2.859375, "grad_norm_var": 0.0567535400390625, "learning_rate": 0.0001, "loss": 5.1099, "loss/crossentropy": 2.2601696252822876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3050367534160614, "step": 1700 }, { "epoch": 0.03404, "grad_norm": 2.640625, "grad_norm_var": 0.06572977701822917, "learning_rate": 0.0001, "loss": 4.9925, "loss/crossentropy": 2.2910414934158325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3248990923166275, "step": 1702 }, { "epoch": 0.03408, "grad_norm": 2.890625, "grad_norm_var": 0.06843159993489584, "learning_rate": 0.0001, "loss": 5.3814, "loss/crossentropy": 1.9898682832717896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27573561668395996, "step": 1704 }, { "epoch": 0.03412, "grad_norm": 2.96875, "grad_norm_var": 0.06398824055989584, "learning_rate": 0.0001, "loss": 5.1506, "loss/crossentropy": 2.1234602332115173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30724021792411804, "step": 1706 }, { "epoch": 0.03416, "grad_norm": 3.390625, "grad_norm_var": 0.0764801025390625, "learning_rate": 0.0001, "loss": 5.5433, "loss/crossentropy": 2.34807026386261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32687780261039734, "step": 1708 }, { "epoch": 0.0342, "grad_norm": 2.71875, "grad_norm_var": 0.07669169108072917, "learning_rate": 0.0001, "loss": 5.1249, "loss/crossentropy": 2.17264860868454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3086177706718445, "step": 1710 }, { "epoch": 0.03424, "grad_norm": 2.96875, "grad_norm_var": 0.0303131103515625, "learning_rate": 0.0001, "loss": 5.3613, "loss/crossentropy": 2.1094497442245483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29483039677143097, "step": 1712 }, { "epoch": 0.03428, "grad_norm": 2.71875, "grad_norm_var": 0.03284505208333333, "learning_rate": 0.0001, "loss": 5.256, "loss/crossentropy": 2.2379074692726135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2799292802810669, "step": 1714 }, { "epoch": 0.03432, "grad_norm": 3.078125, "grad_norm_var": 0.21389567057291667, "learning_rate": 0.0001, "loss": 5.5834, "loss/crossentropy": 2.4616905450820923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33721986413002014, "step": 1716 }, { "epoch": 0.03436, "grad_norm": 2.953125, "grad_norm_var": 0.2042877197265625, "learning_rate": 0.0001, "loss": 5.4415, "loss/crossentropy": 2.383226990699768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3242805302143097, "step": 1718 }, { "epoch": 0.0344, "grad_norm": 2.921875, "grad_norm_var": 0.19931538899739584, "learning_rate": 0.0001, "loss": 5.5514, "loss/crossentropy": 2.495948314666748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3131762146949768, "step": 1720 }, { "epoch": 0.03444, "grad_norm": 3.109375, "grad_norm_var": 0.19524739583333334, "learning_rate": 0.0001, "loss": 5.6767, "loss/crossentropy": 2.1921653747558594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.320631667971611, "step": 1722 }, { "epoch": 0.03448, "grad_norm": 3.40625, "grad_norm_var": 0.20221354166666666, "learning_rate": 0.0001, "loss": 5.2, "loss/crossentropy": 2.0795475840568542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29602357745170593, "step": 1724 }, { "epoch": 0.03452, "grad_norm": 2.75, "grad_norm_var": 0.201171875, "learning_rate": 0.0001, "loss": 4.7489, "loss/crossentropy": 1.911207377910614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.287412166595459, "step": 1726 }, { "epoch": 0.03456, "grad_norm": 2.96875, "grad_norm_var": 0.20129801432291666, "learning_rate": 0.0001, "loss": 5.155, "loss/crossentropy": 2.0638214349746704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32240423560142517, "step": 1728 }, { "epoch": 0.0346, "grad_norm": 2.75, "grad_norm_var": 0.20066630045572917, "learning_rate": 0.0001, "loss": 5.3464, "loss/crossentropy": 2.355573534965515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3161381185054779, "step": 1730 }, { "epoch": 0.03464, "grad_norm": 2.8125, "grad_norm_var": 0.04185282389322917, "learning_rate": 0.0001, "loss": 5.1039, "loss/crossentropy": 2.2227123975753784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31909704208374023, "step": 1732 }, { "epoch": 0.03468, "grad_norm": 2.765625, "grad_norm_var": 0.03954671223958333, "learning_rate": 0.0001, "loss": 5.2249, "loss/crossentropy": 2.203602910041809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3238847255706787, "step": 1734 }, { "epoch": 0.03472, "grad_norm": 3.34375, "grad_norm_var": 0.07062174479166666, "learning_rate": 0.0001, "loss": 5.0623, "loss/crossentropy": 2.2696332335472107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3174774497747421, "step": 1736 }, { "epoch": 0.03476, "grad_norm": 2.78125, "grad_norm_var": 0.07385660807291666, "learning_rate": 0.0001, "loss": 5.0469, "loss/crossentropy": 1.8124465942382812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2645348161458969, "step": 1738 }, { "epoch": 0.0348, "grad_norm": 3.015625, "grad_norm_var": 0.0578765869140625, "learning_rate": 0.0001, "loss": 5.589, "loss/crossentropy": 2.1951464414596558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31606370210647583, "step": 1740 }, { "epoch": 0.03484, "grad_norm": 3.109375, "grad_norm_var": 0.05624593098958333, "learning_rate": 0.0001, "loss": 5.4867, "loss/crossentropy": 2.4413230419158936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33688417077064514, "step": 1742 }, { "epoch": 0.03488, "grad_norm": 3.09375, "grad_norm_var": 0.05705973307291667, "learning_rate": 0.0001, "loss": 5.7898, "loss/crossentropy": 2.1357219219207764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3476516157388687, "step": 1744 }, { "epoch": 0.03492, "grad_norm": 2.859375, "grad_norm_var": 0.0551666259765625, "learning_rate": 0.0001, "loss": 5.4465, "loss/crossentropy": 2.1557592153549194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3080083876848221, "step": 1746 }, { "epoch": 0.03496, "grad_norm": 2.859375, "grad_norm_var": 0.0512603759765625, "learning_rate": 0.0001, "loss": 5.4949, "loss/crossentropy": 2.3549705743789673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33814217150211334, "step": 1748 }, { "epoch": 0.035, "grad_norm": 3.109375, "grad_norm_var": 0.04348958333333333, "learning_rate": 0.0001, "loss": 5.5881, "loss/crossentropy": 2.382844924926758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31944599747657776, "step": 1750 }, { "epoch": 0.03504, "grad_norm": 2.765625, "grad_norm_var": 0.02138671875, "learning_rate": 0.0001, "loss": 5.2937, "loss/crossentropy": 2.3312920331954956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3346693813800812, "step": 1752 }, { "epoch": 0.03508, "grad_norm": 3.015625, "grad_norm_var": 0.019677734375, "learning_rate": 0.0001, "loss": 5.1981, "loss/crossentropy": 2.1921491026878357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31133508682250977, "step": 1754 }, { "epoch": 0.03512, "grad_norm": 2.71875, "grad_norm_var": 0.024494425455729166, "learning_rate": 0.0001, "loss": 4.8796, "loss/crossentropy": 2.0229611992836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26832816004753113, "step": 1756 }, { "epoch": 0.03516, "grad_norm": 2.75, "grad_norm_var": 0.025764973958333333, "learning_rate": 0.0001, "loss": 5.0322, "loss/crossentropy": 1.8138108849525452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2576165944337845, "step": 1758 }, { "epoch": 0.0352, "grad_norm": 2.75, "grad_norm_var": 0.0232574462890625, "learning_rate": 0.0001, "loss": 5.1276, "loss/crossentropy": 2.0019126534461975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28094957768917084, "step": 1760 }, { "epoch": 0.03524, "grad_norm": 2.703125, "grad_norm_var": 0.022554524739583335, "learning_rate": 0.0001, "loss": 5.1776, "loss/crossentropy": 2.400240898132324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30936548113822937, "step": 1762 }, { "epoch": 0.03528, "grad_norm": 3.328125, "grad_norm_var": 0.03495992024739583, "learning_rate": 0.0001, "loss": 4.9401, "loss/crossentropy": 2.019958734512329, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27741560339927673, "step": 1764 }, { "epoch": 0.03532, "grad_norm": 2.875, "grad_norm_var": 0.025032552083333333, "learning_rate": 0.0001, "loss": 5.0912, "loss/crossentropy": 1.9099596738815308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28769225627183914, "step": 1766 }, { "epoch": 0.03536, "grad_norm": 2.890625, "grad_norm_var": 0.024344889322916667, "learning_rate": 0.0001, "loss": 5.42, "loss/crossentropy": 2.2508283853530884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2945093661546707, "step": 1768 }, { "epoch": 0.0354, "grad_norm": 2.828125, "grad_norm_var": 0.0222320556640625, "learning_rate": 0.0001, "loss": 5.2516, "loss/crossentropy": 1.9332409501075745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26304441690444946, "step": 1770 }, { "epoch": 0.03544, "grad_norm": 2.875, "grad_norm_var": 0.022557576497395832, "learning_rate": 0.0001, "loss": 5.1896, "loss/crossentropy": 2.143627643585205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28407415747642517, "step": 1772 }, { "epoch": 0.03548, "grad_norm": 2.859375, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 5.4763, "loss/crossentropy": 2.32085120677948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3176119029521942, "step": 1774 }, { "epoch": 0.03552, "grad_norm": 2.828125, "grad_norm_var": 0.028539021809895832, "learning_rate": 0.0001, "loss": 5.1717, "loss/crossentropy": 1.7893801927566528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27407801151275635, "step": 1776 }, { "epoch": 0.03556, "grad_norm": 2.78125, "grad_norm_var": 0.027228800455729167, "learning_rate": 0.0001, "loss": 5.42, "loss/crossentropy": 2.206292986869812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3071517199277878, "step": 1778 }, { "epoch": 0.0356, "grad_norm": 2.84375, "grad_norm_var": 0.013263956705729166, "learning_rate": 0.0001, "loss": 5.1879, "loss/crossentropy": 2.1285043954849243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3164139539003372, "step": 1780 }, { "epoch": 0.03564, "grad_norm": 3.15625, "grad_norm_var": 0.018635050455729166, "learning_rate": 0.0001, "loss": 5.2826, "loss/crossentropy": 2.16570383310318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3052050769329071, "step": 1782 }, { "epoch": 0.03568, "grad_norm": 2.734375, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 5.0921, "loss/crossentropy": 1.9799941778182983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655785381793976, "step": 1784 }, { "epoch": 0.03572, "grad_norm": 2.765625, "grad_norm_var": 0.0209625244140625, "learning_rate": 0.0001, "loss": 5.2468, "loss/crossentropy": 1.9801498651504517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2824363112449646, "step": 1786 }, { "epoch": 0.03576, "grad_norm": 2.828125, "grad_norm_var": 0.020947265625, "learning_rate": 0.0001, "loss": 5.0131, "loss/crossentropy": 1.5805786848068237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25234321504831314, "step": 1788 }, { "epoch": 0.0358, "grad_norm": 3.15625, "grad_norm_var": 0.021484375, "learning_rate": 0.0001, "loss": 5.296, "loss/crossentropy": 2.2434048652648926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2896551638841629, "step": 1790 }, { "epoch": 0.03584, "grad_norm": 2.78125, "grad_norm_var": 0.022411092122395834, "learning_rate": 0.0001, "loss": 5.0179, "loss/crossentropy": 1.9738762378692627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2959328889846802, "step": 1792 }, { "epoch": 0.03588, "grad_norm": 3.0625, "grad_norm_var": 0.0272857666015625, "learning_rate": 0.0001, "loss": 5.2317, "loss/crossentropy": 2.222583770751953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29915711283683777, "step": 1794 }, { "epoch": 0.03592, "grad_norm": 2.8125, "grad_norm_var": 0.032942708333333334, "learning_rate": 0.0001, "loss": 5.4192, "loss/crossentropy": 2.188909649848938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.294817179441452, "step": 1796 }, { "epoch": 0.03596, "grad_norm": 3.015625, "grad_norm_var": 0.030907185872395833, "learning_rate": 0.0001, "loss": 5.6303, "loss/crossentropy": 2.4745373725891113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32838208973407745, "step": 1798 }, { "epoch": 0.036, "grad_norm": 2.890625, "grad_norm_var": 0.028595987955729166, "learning_rate": 0.0001, "loss": 5.3466, "loss/crossentropy": 1.9314215779304504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29102426767349243, "step": 1800 }, { "epoch": 0.03604, "grad_norm": 2.796875, "grad_norm_var": 0.026276652018229166, "learning_rate": 0.0001, "loss": 4.9782, "loss/crossentropy": 2.0099900364875793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30170293152332306, "step": 1802 }, { "epoch": 0.03608, "grad_norm": 2.859375, "grad_norm_var": 0.024388631184895832, "learning_rate": 0.0001, "loss": 5.2506, "loss/crossentropy": 2.1273564100265503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3019937574863434, "step": 1804 }, { "epoch": 0.03612, "grad_norm": 2.96875, "grad_norm_var": 0.016341145833333334, "learning_rate": 0.0001, "loss": 5.1003, "loss/crossentropy": 2.065160095691681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2817380279302597, "step": 1806 }, { "epoch": 0.03616, "grad_norm": 2.828125, "grad_norm_var": 0.018561808268229167, "learning_rate": 0.0001, "loss": 5.1569, "loss/crossentropy": 2.262821078300476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33245067298412323, "step": 1808 }, { "epoch": 0.0362, "grad_norm": 2.625, "grad_norm_var": 0.020003255208333334, "learning_rate": 0.0001, "loss": 4.8323, "loss/crossentropy": 2.164163827896118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26008155941963196, "step": 1810 }, { "epoch": 0.03624, "grad_norm": 2.78125, "grad_norm_var": 0.012398274739583333, "learning_rate": 0.0001, "loss": 5.2517, "loss/crossentropy": 2.147629737854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3073730617761612, "step": 1812 }, { "epoch": 0.03628, "grad_norm": 2.75, "grad_norm_var": 0.023900349934895832, "learning_rate": 0.0001, "loss": 5.1255, "loss/crossentropy": 2.04233980178833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981158718466759, "step": 1814 }, { "epoch": 0.03632, "grad_norm": 3.0, "grad_norm_var": 0.026439412434895834, "learning_rate": 0.0001, "loss": 4.9465, "loss/crossentropy": 2.264409065246582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32124973833560944, "step": 1816 }, { "epoch": 0.03636, "grad_norm": 3.109375, "grad_norm_var": 0.033610026041666664, "learning_rate": 0.0001, "loss": 5.4324, "loss/crossentropy": 2.092079997062683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3318764269351959, "step": 1818 }, { "epoch": 0.0364, "grad_norm": 2.828125, "grad_norm_var": 0.03850504557291667, "learning_rate": 0.0001, "loss": 5.2766, "loss/crossentropy": 2.1007314324378967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2930498272180557, "step": 1820 }, { "epoch": 0.03644, "grad_norm": 2.84375, "grad_norm_var": 0.03697001139322917, "learning_rate": 0.0001, "loss": 5.129, "loss/crossentropy": 2.2377375960350037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30473431944847107, "step": 1822 }, { "epoch": 0.03648, "grad_norm": 3.296875, "grad_norm_var": 0.04684244791666667, "learning_rate": 0.0001, "loss": 5.4209, "loss/crossentropy": 2.0965787172317505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30526305735111237, "step": 1824 }, { "epoch": 0.03652, "grad_norm": 3.078125, "grad_norm_var": 0.04755859375, "learning_rate": 0.0001, "loss": 5.0359, "loss/crossentropy": 2.208711862564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2727830111980438, "step": 1826 }, { "epoch": 0.03656, "grad_norm": 2.796875, "grad_norm_var": 0.050633748372395836, "learning_rate": 0.0001, "loss": 5.1692, "loss/crossentropy": 2.1706738471984863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3054092824459076, "step": 1828 }, { "epoch": 0.0366, "grad_norm": 2.796875, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 5.2311, "loss/crossentropy": 2.0891621112823486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3111531287431717, "step": 1830 }, { "epoch": 0.03664, "grad_norm": 2.859375, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 5.0537, "loss/crossentropy": 2.0721256732940674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30227208137512207, "step": 1832 }, { "epoch": 0.03668, "grad_norm": 3.015625, "grad_norm_var": 0.03023681640625, "learning_rate": 0.0001, "loss": 5.1267, "loss/crossentropy": 2.2015734910964966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3185647875070572, "step": 1834 }, { "epoch": 0.03672, "grad_norm": 3.03125, "grad_norm_var": 0.026493326822916666, "learning_rate": 0.0001, "loss": 5.1973, "loss/crossentropy": 2.1985132694244385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3032165467739105, "step": 1836 }, { "epoch": 0.03676, "grad_norm": 2.890625, "grad_norm_var": 0.026334635416666665, "learning_rate": 0.0001, "loss": 5.2041, "loss/crossentropy": 2.170067548751831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3295029550790787, "step": 1838 }, { "epoch": 0.0368, "grad_norm": 2.921875, "grad_norm_var": 0.014615885416666667, "learning_rate": 0.0001, "loss": 5.0049, "loss/crossentropy": 2.113592267036438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775915116071701, "step": 1840 }, { "epoch": 0.03684, "grad_norm": 2.78125, "grad_norm_var": 0.0117828369140625, "learning_rate": 0.0001, "loss": 5.4221, "loss/crossentropy": 2.1905024647712708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33297547698020935, "step": 1842 }, { "epoch": 0.03688, "grad_norm": 2.828125, "grad_norm_var": 0.012919108072916666, "learning_rate": 0.0001, "loss": 5.1859, "loss/crossentropy": 2.252369999885559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29375749826431274, "step": 1844 }, { "epoch": 0.03692, "grad_norm": 3.015625, "grad_norm_var": 0.020182291666666668, "learning_rate": 0.0001, "loss": 4.8942, "loss/crossentropy": 1.7526759505271912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2707225978374481, "step": 1846 }, { "epoch": 0.03696, "grad_norm": 3.21875, "grad_norm_var": 0.028880818684895834, "learning_rate": 0.0001, "loss": 5.6529, "loss/crossentropy": 2.592544913291931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3520146906375885, "step": 1848 }, { "epoch": 0.037, "grad_norm": 2.8125, "grad_norm_var": 0.0278961181640625, "learning_rate": 0.0001, "loss": 4.9816, "loss/crossentropy": 1.8699345588684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2852860391139984, "step": 1850 }, { "epoch": 0.03704, "grad_norm": 2.640625, "grad_norm_var": 0.03209228515625, "learning_rate": 0.0001, "loss": 5.1326, "loss/crossentropy": 2.2219313383102417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3210798054933548, "step": 1852 }, { "epoch": 0.03708, "grad_norm": 2.796875, "grad_norm_var": 0.0327056884765625, "learning_rate": 0.0001, "loss": 5.2301, "loss/crossentropy": 1.9926818013191223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26928839832544327, "step": 1854 }, { "epoch": 0.03712, "grad_norm": 2.828125, "grad_norm_var": 0.032515462239583334, "learning_rate": 0.0001, "loss": 5.0372, "loss/crossentropy": 2.019917130470276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28046920895576477, "step": 1856 }, { "epoch": 0.03716, "grad_norm": 4.84375, "grad_norm_var": 0.2762858072916667, "learning_rate": 0.0001, "loss": 5.6297, "loss/crossentropy": 2.2585690021514893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30598941445350647, "step": 1858 }, { "epoch": 0.0372, "grad_norm": 2.984375, "grad_norm_var": 0.27327067057291665, "learning_rate": 0.0001, "loss": 5.3508, "loss/crossentropy": 2.298324942588806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3494870364665985, "step": 1860 }, { "epoch": 0.03724, "grad_norm": 3.125, "grad_norm_var": 0.25745035807291666, "learning_rate": 0.0001, "loss": 5.54, "loss/crossentropy": 2.430496573448181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3227449208498001, "step": 1862 }, { "epoch": 0.03728, "grad_norm": 2.953125, "grad_norm_var": 0.26183980305989585, "learning_rate": 0.0001, "loss": 5.1876, "loss/crossentropy": 2.090576171875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2790074646472931, "step": 1864 }, { "epoch": 0.03732, "grad_norm": 3.0625, "grad_norm_var": 0.25806884765625, "learning_rate": 0.0001, "loss": 5.1799, "loss/crossentropy": 2.2794109582901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3135898858308792, "step": 1866 }, { "epoch": 0.03736, "grad_norm": 2.640625, "grad_norm_var": 0.2597076416015625, "learning_rate": 0.0001, "loss": 4.9333, "loss/crossentropy": 2.2433481216430664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27558377385139465, "step": 1868 }, { "epoch": 0.0374, "grad_norm": 2.59375, "grad_norm_var": 0.26806233723958334, "learning_rate": 0.0001, "loss": 5.292, "loss/crossentropy": 2.3111730813980103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3172578364610672, "step": 1870 }, { "epoch": 0.03744, "grad_norm": 2.921875, "grad_norm_var": 0.270654296875, "learning_rate": 0.0001, "loss": 5.2364, "loss/crossentropy": 2.0028095841407776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35644619166851044, "step": 1872 }, { "epoch": 0.03748, "grad_norm": 2.828125, "grad_norm_var": 0.04501546223958333, "learning_rate": 0.0001, "loss": 5.2183, "loss/crossentropy": 2.347644329071045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3119680881500244, "step": 1874 }, { "epoch": 0.03752, "grad_norm": 2.65625, "grad_norm_var": 0.03474833170572917, "learning_rate": 0.0001, "loss": 5.0787, "loss/crossentropy": 2.118954062461853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28975334763526917, "step": 1876 }, { "epoch": 0.03756, "grad_norm": 3.0625, "grad_norm_var": 0.033568318684895834, "learning_rate": 0.0001, "loss": 5.6649, "loss/crossentropy": 2.4376548528671265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3358190506696701, "step": 1878 }, { "epoch": 0.0376, "grad_norm": 3.28125, "grad_norm_var": 0.042724609375, "learning_rate": 0.0001, "loss": 5.4924, "loss/crossentropy": 2.5907636880874634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3234570771455765, "step": 1880 }, { "epoch": 0.03764, "grad_norm": 2.859375, "grad_norm_var": 0.04352925618489583, "learning_rate": 0.0001, "loss": 5.0298, "loss/crossentropy": 1.574956238269806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26118964701890945, "step": 1882 }, { "epoch": 0.03768, "grad_norm": 3.203125, "grad_norm_var": 0.04599609375, "learning_rate": 0.0001, "loss": 5.3339, "loss/crossentropy": 2.3571736812591553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.308853879570961, "step": 1884 }, { "epoch": 0.03772, "grad_norm": 2.546875, "grad_norm_var": 0.03877665201822917, "learning_rate": 0.0001, "loss": 5.0807, "loss/crossentropy": 2.1266958117485046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.307782918214798, "step": 1886 }, { "epoch": 0.03776, "grad_norm": 3.0, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 5.5402, "loss/crossentropy": 2.3529077768325806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32287272810935974, "step": 1888 }, { "epoch": 0.0378, "grad_norm": 3.015625, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 5.4555, "loss/crossentropy": 2.277345299720764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32319171726703644, "step": 1890 }, { "epoch": 0.03784, "grad_norm": 2.859375, "grad_norm_var": 0.035008748372395836, "learning_rate": 0.0001, "loss": 5.01, "loss/crossentropy": 2.102940857410431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.292771652340889, "step": 1892 }, { "epoch": 0.03788, "grad_norm": 2.640625, "grad_norm_var": 0.03998921712239583, "learning_rate": 0.0001, "loss": 5.0784, "loss/crossentropy": 1.9744033813476562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2844446450471878, "step": 1894 }, { "epoch": 0.03792, "grad_norm": 2.625, "grad_norm_var": 0.03250325520833333, "learning_rate": 0.0001, "loss": 4.9859, "loss/crossentropy": 1.8222022652626038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28611525893211365, "step": 1896 }, { "epoch": 0.03796, "grad_norm": 2.96875, "grad_norm_var": 0.03135477701822917, "learning_rate": 0.0001, "loss": 5.0704, "loss/crossentropy": 2.1963966488838196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2988738566637039, "step": 1898 }, { "epoch": 0.038, "grad_norm": 3.046875, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 5.1155, "loss/crossentropy": 1.9982789754867554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.312205046415329, "step": 1900 }, { "epoch": 0.03804, "grad_norm": 2.75, "grad_norm_var": 0.020612589518229165, "learning_rate": 0.0001, "loss": 5.2097, "loss/crossentropy": 2.1999258995056152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29916079342365265, "step": 1902 }, { "epoch": 0.03808, "grad_norm": 2.859375, "grad_norm_var": 0.016999308268229166, "learning_rate": 0.0001, "loss": 5.2233, "loss/crossentropy": 2.0725532174110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3093830645084381, "step": 1904 }, { "epoch": 0.03812, "grad_norm": 3.03125, "grad_norm_var": 0.027079264322916668, "learning_rate": 0.0001, "loss": 5.7014, "loss/crossentropy": 2.2504276037216187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.332836389541626, "step": 1906 }, { "epoch": 0.03816, "grad_norm": 2.875, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 5.3413, "loss/crossentropy": 2.1570577025413513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3210139572620392, "step": 1908 }, { "epoch": 0.0382, "grad_norm": 3.59375, "grad_norm_var": 0.05137430826822917, "learning_rate": 0.0001, "loss": 5.6215, "loss/crossentropy": 2.0739041566848755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091907352209091, "step": 1910 }, { "epoch": 0.03824, "grad_norm": 2.703125, "grad_norm_var": 0.0502838134765625, "learning_rate": 0.0001, "loss": 5.0772, "loss/crossentropy": 2.0542168021202087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2791624963283539, "step": 1912 }, { "epoch": 0.03828, "grad_norm": 2.65625, "grad_norm_var": 0.0561431884765625, "learning_rate": 0.0001, "loss": 4.9386, "loss/crossentropy": 1.9705287218093872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25483617186546326, "step": 1914 }, { "epoch": 0.03832, "grad_norm": 2.875, "grad_norm_var": 0.05819905598958333, "learning_rate": 0.0001, "loss": 5.054, "loss/crossentropy": 2.0234111547470093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3110152333974838, "step": 1916 }, { "epoch": 0.03836, "grad_norm": 3.15625, "grad_norm_var": 0.06004130045572917, "learning_rate": 0.0001, "loss": 5.2723, "loss/crossentropy": 2.010735273361206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2943515181541443, "step": 1918 }, { "epoch": 0.0384, "grad_norm": 2.734375, "grad_norm_var": 0.06575419108072916, "learning_rate": 0.0001, "loss": 4.9471, "loss/crossentropy": 2.1912686824798584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27914971113204956, "step": 1920 }, { "epoch": 0.03844, "grad_norm": 2.421875, "grad_norm_var": 0.0709869384765625, "learning_rate": 0.0001, "loss": 5.085, "loss/crossentropy": 1.9889940023422241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26166442036628723, "step": 1922 }, { "epoch": 0.03848, "grad_norm": 2.96875, "grad_norm_var": 0.07668863932291667, "learning_rate": 0.0001, "loss": 5.3534, "loss/crossentropy": 2.154898941516876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31905338168144226, "step": 1924 }, { "epoch": 0.03852, "grad_norm": 2.796875, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 5.0847, "loss/crossentropy": 2.44269061088562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28736811876296997, "step": 1926 }, { "epoch": 0.03856, "grad_norm": 2.6875, "grad_norm_var": 0.04108784993489583, "learning_rate": 0.0001, "loss": 5.0585, "loss/crossentropy": 1.6790328621864319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32607489824295044, "step": 1928 }, { "epoch": 0.0386, "grad_norm": 3.046875, "grad_norm_var": 0.05115559895833333, "learning_rate": 0.0001, "loss": 5.336, "loss/crossentropy": 2.0223641991615295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2871186435222626, "step": 1930 }, { "epoch": 0.03864, "grad_norm": 2.8125, "grad_norm_var": 0.05756734212239583, "learning_rate": 0.0001, "loss": 5.549, "loss/crossentropy": 2.451051712036133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33973076939582825, "step": 1932 }, { "epoch": 0.03868, "grad_norm": 2.765625, "grad_norm_var": 0.052262369791666666, "learning_rate": 0.0001, "loss": 5.4403, "loss/crossentropy": 2.2884862422943115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29729554057121277, "step": 1934 }, { "epoch": 0.03872, "grad_norm": 2.890625, "grad_norm_var": 0.04889322916666667, "learning_rate": 0.0001, "loss": 5.6203, "loss/crossentropy": 2.113345444202423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.36945630609989166, "step": 1936 }, { "epoch": 0.03876, "grad_norm": 2.71875, "grad_norm_var": 0.03752848307291667, "learning_rate": 0.0001, "loss": 5.1845, "loss/crossentropy": 2.139409840106964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2899101823568344, "step": 1938 }, { "epoch": 0.0388, "grad_norm": 2.953125, "grad_norm_var": 0.03622639973958333, "learning_rate": 0.0001, "loss": 5.5397, "loss/crossentropy": 2.1029305458068848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29830390214920044, "step": 1940 }, { "epoch": 0.03884, "grad_norm": 2.734375, "grad_norm_var": 0.03816630045572917, "learning_rate": 0.0001, "loss": 5.1383, "loss/crossentropy": 1.7736502885818481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27648696303367615, "step": 1942 }, { "epoch": 0.03888, "grad_norm": 2.671875, "grad_norm_var": 0.03942057291666667, "learning_rate": 0.0001, "loss": 5.0885, "loss/crossentropy": 2.0281469225883484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30329641699790955, "step": 1944 }, { "epoch": 0.03892, "grad_norm": 2.890625, "grad_norm_var": 0.03528544108072917, "learning_rate": 0.0001, "loss": 5.3265, "loss/crossentropy": 2.404891610145569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3563212752342224, "step": 1946 }, { "epoch": 0.03896, "grad_norm": 2.53125, "grad_norm_var": 0.03572489420572917, "learning_rate": 0.0001, "loss": 5.0657, "loss/crossentropy": 2.2187922596931458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2999056503176689, "step": 1948 }, { "epoch": 0.039, "grad_norm": 2.875, "grad_norm_var": 0.03566792805989583, "learning_rate": 0.0001, "loss": 5.0499, "loss/crossentropy": 2.3901994228363037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3288661539554596, "step": 1950 }, { "epoch": 0.03904, "grad_norm": 3.15625, "grad_norm_var": 0.04069722493489583, "learning_rate": 0.0001, "loss": 5.252, "loss/crossentropy": 2.274617314338684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2928028404712677, "step": 1952 }, { "epoch": 0.03908, "grad_norm": 2.859375, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 5.4463, "loss/crossentropy": 2.2478950023651123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3027127981185913, "step": 1954 }, { "epoch": 0.03912, "grad_norm": 2.78125, "grad_norm_var": 0.030321248372395835, "learning_rate": 0.0001, "loss": 5.2707, "loss/crossentropy": 2.0634876489639282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29374830424785614, "step": 1956 }, { "epoch": 0.03916, "grad_norm": 2.90625, "grad_norm_var": 0.0287994384765625, "learning_rate": 0.0001, "loss": 5.3441, "loss/crossentropy": 2.171326994895935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2995557487010956, "step": 1958 }, { "epoch": 0.0392, "grad_norm": 2.515625, "grad_norm_var": 0.040266927083333334, "learning_rate": 0.0001, "loss": 4.9937, "loss/crossentropy": 2.142563223838806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27376608550548553, "step": 1960 }, { "epoch": 0.03924, "grad_norm": 5.34375, "grad_norm_var": 0.42568359375, "learning_rate": 0.0001, "loss": 5.4145, "loss/crossentropy": 2.3189245462417603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.37109237909317017, "step": 1962 }, { "epoch": 0.03928, "grad_norm": 3.203125, "grad_norm_var": 0.40812886555989586, "learning_rate": 0.0001, "loss": 5.1826, "loss/crossentropy": 2.1483139991760254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32499393820762634, "step": 1964 }, { "epoch": 0.03932, "grad_norm": 3.046875, "grad_norm_var": 0.404736328125, "learning_rate": 0.0001, "loss": 5.445, "loss/crossentropy": 2.2916383743286133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29367291927337646, "step": 1966 }, { "epoch": 0.03936, "grad_norm": 2.9375, "grad_norm_var": 0.3999582926432292, "learning_rate": 0.0001, "loss": 5.1663, "loss/crossentropy": 2.4217371940612793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31309331953525543, "step": 1968 }, { "epoch": 0.0394, "grad_norm": 2.9375, "grad_norm_var": 0.40103759765625, "learning_rate": 0.0001, "loss": 5.2057, "loss/crossentropy": 1.9491975903511047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29921913146972656, "step": 1970 }, { "epoch": 0.03944, "grad_norm": 2.78125, "grad_norm_var": 0.39126688639322915, "learning_rate": 0.0001, "loss": 5.1788, "loss/crossentropy": 2.144432306289673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29162150621414185, "step": 1972 }, { "epoch": 0.03948, "grad_norm": 2.765625, "grad_norm_var": 0.40748291015625, "learning_rate": 0.0001, "loss": 5.1336, "loss/crossentropy": 1.9492529034614563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2875414192676544, "step": 1974 }, { "epoch": 0.03952, "grad_norm": 2.75, "grad_norm_var": 0.4002593994140625, "learning_rate": 0.0001, "loss": 5.053, "loss/crossentropy": 1.9269813895225525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2755381464958191, "step": 1976 }, { "epoch": 0.03956, "grad_norm": 2.828125, "grad_norm_var": 0.03798726399739583, "learning_rate": 0.0001, "loss": 4.8879, "loss/crossentropy": 2.074360489845276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2987503558397293, "step": 1978 }, { "epoch": 0.0396, "grad_norm": 2.953125, "grad_norm_var": 0.02086181640625, "learning_rate": 0.0001, "loss": 4.8834, "loss/crossentropy": 2.257633090019226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2992274910211563, "step": 1980 }, { "epoch": 0.03964, "grad_norm": 2.75, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 4.9533, "loss/crossentropy": 1.8207083940505981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752758115530014, "step": 1982 }, { "epoch": 0.03968, "grad_norm": 2.875, "grad_norm_var": 0.015623982747395833, "learning_rate": 0.0001, "loss": 5.343, "loss/crossentropy": 2.105292797088623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3131226450204849, "step": 1984 }, { "epoch": 0.03972, "grad_norm": 3.484375, "grad_norm_var": 0.05191650390625, "learning_rate": 0.0001, "loss": 5.4785, "loss/crossentropy": 2.1191373467445374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3175530731678009, "step": 1986 }, { "epoch": 0.03976, "grad_norm": 2.75, "grad_norm_var": 0.051878865559895834, "learning_rate": 0.0001, "loss": 4.9236, "loss/crossentropy": 2.2214397192001343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3091724067926407, "step": 1988 }, { "epoch": 0.0398, "grad_norm": 2.875, "grad_norm_var": 0.05133056640625, "learning_rate": 0.0001, "loss": 5.0031, "loss/crossentropy": 1.7347424626350403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2825440764427185, "step": 1990 }, { "epoch": 0.03984, "grad_norm": 2.890625, "grad_norm_var": 0.0500396728515625, "learning_rate": 0.0001, "loss": 5.0951, "loss/crossentropy": 2.1566559076309204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2961925268173218, "step": 1992 }, { "epoch": 0.03988, "grad_norm": 2.703125, "grad_norm_var": 0.06396077473958334, "learning_rate": 0.0001, "loss": 4.9195, "loss/crossentropy": 2.2129205465316772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29672613739967346, "step": 1994 }, { "epoch": 0.03992, "grad_norm": 3.3125, "grad_norm_var": 0.07595113118489584, "learning_rate": 0.0001, "loss": 5.7534, "loss/crossentropy": 2.4702744483947754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3707122802734375, "step": 1996 }, { "epoch": 0.03996, "grad_norm": 2.84375, "grad_norm_var": 0.06731669108072917, "learning_rate": 0.0001, "loss": 5.0296, "loss/crossentropy": 2.0463536977767944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3124735355377197, "step": 1998 }, { "epoch": 0.04, "grad_norm": 2.796875, "grad_norm_var": 0.07281494140625, "learning_rate": 0.0001, "loss": 4.959, "loss/crossentropy": 2.1550235748291016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32177163660526276, "step": 2000 }, { "epoch": 0.04004, "grad_norm": 2.71875, "grad_norm_var": 0.059798177083333334, "learning_rate": 0.0001, "loss": 5.2078, "loss/crossentropy": 2.1312190890312195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29013920575380325, "step": 2002 }, { "epoch": 0.04008, "grad_norm": 2.75, "grad_norm_var": 0.06082356770833333, "learning_rate": 0.0001, "loss": 5.0086, "loss/crossentropy": 1.8546085357666016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2619543671607971, "step": 2004 }, { "epoch": 0.04012, "grad_norm": 3.015625, "grad_norm_var": 0.0561920166015625, "learning_rate": 0.0001, "loss": 5.3416, "loss/crossentropy": 2.262398660182953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28742220997810364, "step": 2006 }, { "epoch": 0.04016, "grad_norm": 3.109375, "grad_norm_var": 0.05734049479166667, "learning_rate": 0.0001, "loss": 5.4315, "loss/crossentropy": 2.156043767929077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30488699674606323, "step": 2008 }, { "epoch": 0.0402, "grad_norm": 2.765625, "grad_norm_var": 0.04290364583333333, "learning_rate": 0.0001, "loss": 4.8761, "loss/crossentropy": 1.925516963005066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2795708477497101, "step": 2010 }, { "epoch": 0.04024, "grad_norm": 2.6875, "grad_norm_var": 0.031281534830729166, "learning_rate": 0.0001, "loss": 5.0729, "loss/crossentropy": 1.947714388370514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27192793786525726, "step": 2012 }, { "epoch": 0.04028, "grad_norm": 2.625, "grad_norm_var": 0.038939412434895834, "learning_rate": 0.0001, "loss": 4.6214, "loss/crossentropy": 1.9584010243415833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835424840450287, "step": 2014 }, { "epoch": 0.04032, "grad_norm": 2.578125, "grad_norm_var": 0.04108784993489583, "learning_rate": 0.0001, "loss": 5.1974, "loss/crossentropy": 2.461808919906616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3104698956012726, "step": 2016 }, { "epoch": 0.04036, "grad_norm": 2.734375, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 5.286, "loss/crossentropy": 2.094545900821686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30010756850242615, "step": 2018 }, { "epoch": 0.0404, "grad_norm": 2.78125, "grad_norm_var": 0.021923828125, "learning_rate": 0.0001, "loss": 5.1274, "loss/crossentropy": 2.353589177131653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29775144159793854, "step": 2020 }, { "epoch": 0.04044, "grad_norm": 3.375, "grad_norm_var": 10.55537821451823, "learning_rate": 0.0001, "loss": 5.3359, "loss/crossentropy": 2.4468252658843994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.34534354507923126, "step": 2022 }, { "epoch": 0.04048, "grad_norm": 2.984375, "grad_norm_var": 10.529678344726562, "learning_rate": 0.0001, "loss": 5.5028, "loss/crossentropy": 2.2037755250930786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.303710475564003, "step": 2024 }, { "epoch": 0.04052, "grad_norm": 2.71875, "grad_norm_var": 10.546240234375, "learning_rate": 0.0001, "loss": 4.9229, "loss/crossentropy": 1.9658318161964417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2829178273677826, "step": 2026 }, { "epoch": 0.04056, "grad_norm": 2.734375, "grad_norm_var": 10.555729166666667, "learning_rate": 0.0001, "loss": 4.8996, "loss/crossentropy": 2.118351697921753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29469528794288635, "step": 2028 }, { "epoch": 0.0406, "grad_norm": 2.875, "grad_norm_var": 10.507957967122396, "learning_rate": 0.0001, "loss": 5.5817, "loss/crossentropy": 2.172826111316681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35036201775074005, "step": 2030 }, { "epoch": 0.04064, "grad_norm": 3.953125, "grad_norm_var": 10.432124837239583, "learning_rate": 0.0001, "loss": 5.4846, "loss/crossentropy": 2.185975730419159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3397497236728668, "step": 2032 }, { "epoch": 0.04068, "grad_norm": 2.78125, "grad_norm_var": 10.411026000976562, "learning_rate": 0.0001, "loss": 5.0145, "loss/crossentropy": 2.043874442577362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3034070134162903, "step": 2034 }, { "epoch": 0.04072, "grad_norm": 2.984375, "grad_norm_var": 10.380106608072916, "learning_rate": 0.0001, "loss": 5.3958, "loss/crossentropy": 2.3315287828445435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32198067009449005, "step": 2036 }, { "epoch": 0.04076, "grad_norm": 2.828125, "grad_norm_var": 0.1112213134765625, "learning_rate": 0.0001, "loss": 5.0266, "loss/crossentropy": 2.05656898021698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30919137597084045, "step": 2038 }, { "epoch": 0.0408, "grad_norm": 2.578125, "grad_norm_var": 0.10188395182291667, "learning_rate": 0.0001, "loss": 5.3205, "loss/crossentropy": 2.2451635599136353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3412973880767822, "step": 2040 }, { "epoch": 0.04084, "grad_norm": 3.1875, "grad_norm_var": 0.10715738932291667, "learning_rate": 0.0001, "loss": 5.1734, "loss/crossentropy": 2.527924060821533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3178148865699768, "step": 2042 }, { "epoch": 0.04088, "grad_norm": 2.78125, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 4.8441, "loss/crossentropy": 2.03126460313797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29549214243888855, "step": 2044 }, { "epoch": 0.04092, "grad_norm": 2.515625, "grad_norm_var": 0.11038004557291667, "learning_rate": 0.0001, "loss": 5.0134, "loss/crossentropy": 2.029997706413269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27938202023506165, "step": 2046 }, { "epoch": 0.04096, "grad_norm": 2.703125, "grad_norm_var": 0.03211263020833333, "learning_rate": 0.0001, "loss": 4.9321, "loss/crossentropy": 1.764098048210144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27712512016296387, "step": 2048 }, { "epoch": 0.041, "grad_norm": 3.046875, "grad_norm_var": 0.0356597900390625, "learning_rate": 0.0001, "loss": 5.435, "loss/crossentropy": 2.605324864387512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3540754020214081, "step": 2050 }, { "epoch": 0.04104, "grad_norm": 2.875, "grad_norm_var": 0.0349761962890625, "learning_rate": 0.0001, "loss": 5.0207, "loss/crossentropy": 1.9333613514900208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29670488089323044, "step": 2052 }, { "epoch": 0.04108, "grad_norm": 3.078125, "grad_norm_var": 0.040022786458333334, "learning_rate": 0.0001, "loss": 5.0056, "loss/crossentropy": 1.7876797914505005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2579014301300049, "step": 2054 }, { "epoch": 0.04112, "grad_norm": 3.125, "grad_norm_var": 0.0417877197265625, "learning_rate": 0.0001, "loss": 5.1401, "loss/crossentropy": 2.0947588682174683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3334304690361023, "step": 2056 }, { "epoch": 0.04116, "grad_norm": 3.0, "grad_norm_var": 0.035965983072916666, "learning_rate": 0.0001, "loss": 4.9009, "loss/crossentropy": 2.1838767528533936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.306907519698143, "step": 2058 }, { "epoch": 0.0412, "grad_norm": 3.015625, "grad_norm_var": 0.0341461181640625, "learning_rate": 0.0001, "loss": 5.3724, "loss/crossentropy": 2.2180997133255005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.314627081155777, "step": 2060 }, { "epoch": 0.04124, "grad_norm": 2.875, "grad_norm_var": 0.025419108072916665, "learning_rate": 0.0001, "loss": 4.8362, "loss/crossentropy": 1.914646863937378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2848198413848877, "step": 2062 }, { "epoch": 0.04128, "grad_norm": 2.609375, "grad_norm_var": 0.0262603759765625, "learning_rate": 0.0001, "loss": 5.4538, "loss/crossentropy": 2.42458713054657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31268230080604553, "step": 2064 }, { "epoch": 0.04132, "grad_norm": 2.828125, "grad_norm_var": 0.02457275390625, "learning_rate": 0.0001, "loss": 5.2497, "loss/crossentropy": 2.23202121257782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30988384783267975, "step": 2066 }, { "epoch": 0.04136, "grad_norm": 2.5625, "grad_norm_var": 0.0284820556640625, "learning_rate": 0.0001, "loss": 5.0416, "loss/crossentropy": 2.0225483179092407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26953594386577606, "step": 2068 }, { "epoch": 0.0414, "grad_norm": 2.875, "grad_norm_var": 0.028669230143229165, "learning_rate": 0.0001, "loss": 4.8259, "loss/crossentropy": 1.8593338131904602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27706706523895264, "step": 2070 }, { "epoch": 0.04144, "grad_norm": 2.59375, "grad_norm_var": 0.023387654622395834, "learning_rate": 0.0001, "loss": 4.9949, "loss/crossentropy": 2.373727560043335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3000074476003647, "step": 2072 }, { "epoch": 0.04148, "grad_norm": 2.578125, "grad_norm_var": 0.022704060872395834, "learning_rate": 0.0001, "loss": 4.9438, "loss/crossentropy": 1.959564983844757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26365046203136444, "step": 2074 }, { "epoch": 0.04152, "grad_norm": 2.921875, "grad_norm_var": 0.014159138997395833, "learning_rate": 0.0001, "loss": 4.983, "loss/crossentropy": 2.0590676069259644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2816864550113678, "step": 2076 }, { "epoch": 0.04156, "grad_norm": 2.84375, "grad_norm_var": 0.017121378580729166, "learning_rate": 0.0001, "loss": 5.2049, "loss/crossentropy": 2.147680163383484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29019051790237427, "step": 2078 }, { "epoch": 0.0416, "grad_norm": 2.609375, "grad_norm_var": 0.020198567708333334, "learning_rate": 0.0001, "loss": 5.6103, "loss/crossentropy": 2.212267220020294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2804659754037857, "step": 2080 }, { "epoch": 0.04164, "grad_norm": 2.65625, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 5.0489, "loss/crossentropy": 2.144743025302887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2946523129940033, "step": 2082 }, { "epoch": 0.04168, "grad_norm": 2.703125, "grad_norm_var": 0.019172159830729167, "learning_rate": 0.0001, "loss": 4.9311, "loss/crossentropy": 2.3702481985092163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2923210561275482, "step": 2084 }, { "epoch": 0.04172, "grad_norm": 2.96875, "grad_norm_var": 0.019684855143229166, "learning_rate": 0.0001, "loss": 5.2291, "loss/crossentropy": 1.9512975811958313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28704553842544556, "step": 2086 }, { "epoch": 0.04176, "grad_norm": 3.25, "grad_norm_var": 0.25388895670572914, "learning_rate": 0.0001, "loss": 4.9572, "loss/crossentropy": 2.180745005607605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28361976146698, "step": 2088 }, { "epoch": 0.0418, "grad_norm": 2.671875, "grad_norm_var": 0.24712626139322916, "learning_rate": 0.0001, "loss": 4.8579, "loss/crossentropy": 1.9768275022506714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28042787313461304, "step": 2090 }, { "epoch": 0.04184, "grad_norm": 2.90625, "grad_norm_var": 0.24544270833333334, "learning_rate": 0.0001, "loss": 5.2602, "loss/crossentropy": 2.148472547531128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30099035799503326, "step": 2092 }, { "epoch": 0.04188, "grad_norm": 2.734375, "grad_norm_var": 0.24763081868489584, "learning_rate": 0.0001, "loss": 5.0152, "loss/crossentropy": 2.1698715686798096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3140450567007065, "step": 2094 }, { "epoch": 0.04192, "grad_norm": 2.828125, "grad_norm_var": 0.24172770182291667, "learning_rate": 0.0001, "loss": 4.8679, "loss/crossentropy": 2.1142334938049316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2847675681114197, "step": 2096 }, { "epoch": 0.04196, "grad_norm": 2.703125, "grad_norm_var": 0.2395660400390625, "learning_rate": 0.0001, "loss": 5.2185, "loss/crossentropy": 2.1908479928970337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28702451288700104, "step": 2098 }, { "epoch": 0.042, "grad_norm": 2.6875, "grad_norm_var": 0.23321024576822916, "learning_rate": 0.0001, "loss": 5.0212, "loss/crossentropy": 2.0519612431526184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29224735498428345, "step": 2100 }, { "epoch": 0.04204, "grad_norm": 2.96875, "grad_norm_var": 0.2412994384765625, "learning_rate": 0.0001, "loss": 4.871, "loss/crossentropy": 1.9304961562156677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28785137832164764, "step": 2102 }, { "epoch": 0.04208, "grad_norm": 2.765625, "grad_norm_var": 0.0134185791015625, "learning_rate": 0.0001, "loss": 5.2462, "loss/crossentropy": 2.297300934791565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.302143856883049, "step": 2104 }, { "epoch": 0.04212, "grad_norm": 2.453125, "grad_norm_var": 0.019782511393229167, "learning_rate": 0.0001, "loss": 5.0491, "loss/crossentropy": 2.2764381170272827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28773219883441925, "step": 2106 }, { "epoch": 0.04216, "grad_norm": 2.625, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 5.0563, "loss/crossentropy": 2.141321837902069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3108212947845459, "step": 2108 }, { "epoch": 0.0422, "grad_norm": 2.71875, "grad_norm_var": 0.0185699462890625, "learning_rate": 0.0001, "loss": 5.0362, "loss/crossentropy": 1.9619495272636414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2938811331987381, "step": 2110 }, { "epoch": 0.04224, "grad_norm": 3.015625, "grad_norm_var": 0.026325480143229166, "learning_rate": 0.0001, "loss": 5.4496, "loss/crossentropy": 1.9741051197052002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28835102915763855, "step": 2112 }, { "epoch": 0.04228, "grad_norm": 2.546875, "grad_norm_var": 0.029255167643229166, "learning_rate": 0.0001, "loss": 4.9303, "loss/crossentropy": 1.9510936737060547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2839510589838028, "step": 2114 }, { "epoch": 0.04232, "grad_norm": 2.828125, "grad_norm_var": 0.026753743489583332, "learning_rate": 0.0001, "loss": 5.2446, "loss/crossentropy": 2.0201885104179382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30627067387104034, "step": 2116 }, { "epoch": 0.04236, "grad_norm": 2.6875, "grad_norm_var": 0.022493489583333335, "learning_rate": 0.0001, "loss": 5.1411, "loss/crossentropy": 2.4522262811660767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.308579683303833, "step": 2118 }, { "epoch": 0.0424, "grad_norm": 3.09375, "grad_norm_var": 0.04345296223958333, "learning_rate": 0.0001, "loss": 5.5535, "loss/crossentropy": 1.9289590120315552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29346515238285065, "step": 2120 }, { "epoch": 0.04244, "grad_norm": 2.8125, "grad_norm_var": 0.03437398274739583, "learning_rate": 0.0001, "loss": 5.0588, "loss/crossentropy": 2.2020061016082764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2992282509803772, "step": 2122 }, { "epoch": 0.04248, "grad_norm": 2.78125, "grad_norm_var": 0.031494140625, "learning_rate": 0.0001, "loss": 5.2466, "loss/crossentropy": 2.180301785469055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28323256969451904, "step": 2124 }, { "epoch": 0.04252, "grad_norm": 2.78125, "grad_norm_var": 0.03435872395833333, "learning_rate": 0.0001, "loss": 4.9061, "loss/crossentropy": 2.1250513792037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.272533118724823, "step": 2126 }, { "epoch": 0.04256, "grad_norm": 2.78125, "grad_norm_var": 0.037873331705729166, "learning_rate": 0.0001, "loss": 5.4375, "loss/crossentropy": 2.3509981632232666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3719516545534134, "step": 2128 }, { "epoch": 0.0426, "grad_norm": 3.078125, "grad_norm_var": 0.03508707682291667, "learning_rate": 0.0001, "loss": 5.3067, "loss/crossentropy": 2.135426163673401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3582882583141327, "step": 2130 }, { "epoch": 0.04264, "grad_norm": 2.578125, "grad_norm_var": 0.0398590087890625, "learning_rate": 0.0001, "loss": 5.2406, "loss/crossentropy": 2.316452383995056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30162203311920166, "step": 2132 }, { "epoch": 0.04268, "grad_norm": 2.765625, "grad_norm_var": 0.03846028645833333, "learning_rate": 0.0001, "loss": 5.0372, "loss/crossentropy": 2.0325432419776917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.287256121635437, "step": 2134 }, { "epoch": 0.04272, "grad_norm": 2.6875, "grad_norm_var": 0.0236236572265625, "learning_rate": 0.0001, "loss": 5.1985, "loss/crossentropy": 2.070056974887848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2552843391895294, "step": 2136 }, { "epoch": 0.04276, "grad_norm": 3.078125, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 5.0623, "loss/crossentropy": 1.7005944848060608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24984879791736603, "step": 2138 }, { "epoch": 0.0428, "grad_norm": 2.75, "grad_norm_var": 0.03203125, "learning_rate": 0.0001, "loss": 5.0862, "loss/crossentropy": 1.6700931787490845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2572527676820755, "step": 2140 }, { "epoch": 0.04284, "grad_norm": 2.65625, "grad_norm_var": 0.03125, "learning_rate": 0.0001, "loss": 5.0186, "loss/crossentropy": 2.3074774742126465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31137382984161377, "step": 2142 }, { "epoch": 0.04288, "grad_norm": 2.84375, "grad_norm_var": 0.0229400634765625, "learning_rate": 0.0001, "loss": 5.1973, "loss/crossentropy": 2.103408098220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30157215893268585, "step": 2144 }, { "epoch": 0.04292, "grad_norm": 2.796875, "grad_norm_var": 0.020979817708333334, "learning_rate": 0.0001, "loss": 4.8206, "loss/crossentropy": 1.8602584600448608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28300634026527405, "step": 2146 }, { "epoch": 0.04296, "grad_norm": 2.671875, "grad_norm_var": 0.019188435872395833, "learning_rate": 0.0001, "loss": 5.0525, "loss/crossentropy": 2.337582588195801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28086431324481964, "step": 2148 }, { "epoch": 0.043, "grad_norm": 2.796875, "grad_norm_var": 0.026439412434895834, "learning_rate": 0.0001, "loss": 5.2405, "loss/crossentropy": 2.2635254859924316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3113311231136322, "step": 2150 }, { "epoch": 0.04304, "grad_norm": 2.96875, "grad_norm_var": 0.0277984619140625, "learning_rate": 0.0001, "loss": 5.187, "loss/crossentropy": 2.3971948623657227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32285284996032715, "step": 2152 }, { "epoch": 0.04308, "grad_norm": 2.875, "grad_norm_var": 0.021891276041666668, "learning_rate": 0.0001, "loss": 5.1438, "loss/crossentropy": 1.8900776505470276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28163351118564606, "step": 2154 }, { "epoch": 0.04312, "grad_norm": 2.765625, "grad_norm_var": 0.020536295572916665, "learning_rate": 0.0001, "loss": 4.9774, "loss/crossentropy": 1.908443808555603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2901918590068817, "step": 2156 }, { "epoch": 0.04316, "grad_norm": 2.75, "grad_norm_var": 0.020409138997395833, "learning_rate": 0.0001, "loss": 4.7808, "loss/crossentropy": 1.8003268837928772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2762569487094879, "step": 2158 }, { "epoch": 0.0432, "grad_norm": 2.640625, "grad_norm_var": 0.0221832275390625, "learning_rate": 0.0001, "loss": 5.0477, "loss/crossentropy": 1.996739387512207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24626458436250687, "step": 2160 }, { "epoch": 0.04324, "grad_norm": 2.84375, "grad_norm_var": 0.018880208333333332, "learning_rate": 0.0001, "loss": 5.1737, "loss/crossentropy": 2.0175461173057556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31059183180332184, "step": 2162 }, { "epoch": 0.04328, "grad_norm": 2.6875, "grad_norm_var": 0.019266764322916668, "learning_rate": 0.0001, "loss": 5.0448, "loss/crossentropy": 2.0009909868240356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835536003112793, "step": 2164 }, { "epoch": 0.04332, "grad_norm": 2.65625, "grad_norm_var": 0.017967732747395833, "learning_rate": 0.0001, "loss": 4.9035, "loss/crossentropy": 1.9848785400390625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26657119393348694, "step": 2166 }, { "epoch": 0.04336, "grad_norm": 2.734375, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 4.9108, "loss/crossentropy": 2.076065957546234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2801993191242218, "step": 2168 }, { "epoch": 0.0434, "grad_norm": 2.828125, "grad_norm_var": 0.011617024739583334, "learning_rate": 0.0001, "loss": 5.1425, "loss/crossentropy": 2.1208528876304626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3152369260787964, "step": 2170 }, { "epoch": 0.04344, "grad_norm": 2.625, "grad_norm_var": 0.011400349934895833, "learning_rate": 0.0001, "loss": 5.0608, "loss/crossentropy": 2.1971306204795837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31046128273010254, "step": 2172 }, { "epoch": 0.04348, "grad_norm": 2.734375, "grad_norm_var": 0.013158162434895834, "learning_rate": 0.0001, "loss": 5.2445, "loss/crossentropy": 2.275176525115967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3090529441833496, "step": 2174 }, { "epoch": 0.04352, "grad_norm": 2.8125, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 4.9366, "loss/crossentropy": 2.1574501395225525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.280165433883667, "step": 2176 }, { "epoch": 0.04356, "grad_norm": 2.65625, "grad_norm_var": 0.012548828125, "learning_rate": 0.0001, "loss": 5.2338, "loss/crossentropy": 2.4236754179000854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3061629384756088, "step": 2178 }, { "epoch": 0.0436, "grad_norm": 2.78125, "grad_norm_var": 0.012809244791666667, "learning_rate": 0.0001, "loss": 5.1707, "loss/crossentropy": 2.1282758712768555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3050261586904526, "step": 2180 }, { "epoch": 0.04364, "grad_norm": 2.828125, "grad_norm_var": 0.008561197916666667, "learning_rate": 0.0001, "loss": 5.4629, "loss/crossentropy": 2.4244707822799683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33464157581329346, "step": 2182 }, { "epoch": 0.04368, "grad_norm": 2.640625, "grad_norm_var": 0.010904947916666666, "learning_rate": 0.0001, "loss": 5.3891, "loss/crossentropy": 2.289917469024658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3181813210248947, "step": 2184 }, { "epoch": 0.04372, "grad_norm": 2.890625, "grad_norm_var": 0.01031494140625, "learning_rate": 0.0001, "loss": 5.4207, "loss/crossentropy": 2.1540024280548096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.298043891787529, "step": 2186 }, { "epoch": 0.04376, "grad_norm": 2.609375, "grad_norm_var": 0.0145660400390625, "learning_rate": 0.0001, "loss": 4.8595, "loss/crossentropy": 1.6615915298461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23485098034143448, "step": 2188 }, { "epoch": 0.0438, "grad_norm": 2.796875, "grad_norm_var": 0.01357421875, "learning_rate": 0.0001, "loss": 5.0595, "loss/crossentropy": 2.352560341358185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2965056747198105, "step": 2190 }, { "epoch": 0.04384, "grad_norm": 2.625, "grad_norm_var": 0.0183746337890625, "learning_rate": 0.0001, "loss": 5.1463, "loss/crossentropy": 2.0864007472991943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2859686613082886, "step": 2192 }, { "epoch": 0.04388, "grad_norm": 2.765625, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 5.2599, "loss/crossentropy": 1.8934992551803589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24994677305221558, "step": 2194 }, { "epoch": 0.04392, "grad_norm": 2.578125, "grad_norm_var": 0.02476806640625, "learning_rate": 0.0001, "loss": 4.9976, "loss/crossentropy": 2.2395824193954468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2719078063964844, "step": 2196 }, { "epoch": 0.04396, "grad_norm": 2.84375, "grad_norm_var": 0.02515869140625, "learning_rate": 0.0001, "loss": 5.2631, "loss/crossentropy": 2.089230954647064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2759791761636734, "step": 2198 }, { "epoch": 0.044, "grad_norm": 2.75, "grad_norm_var": 0.023567708333333333, "learning_rate": 0.0001, "loss": 5.298, "loss/crossentropy": 2.2770241498947144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31005042791366577, "step": 2200 }, { "epoch": 0.04404, "grad_norm": 2.46875, "grad_norm_var": 0.028938802083333333, "learning_rate": 0.0001, "loss": 4.6842, "loss/crossentropy": 2.067028760910034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30712655186653137, "step": 2202 }, { "epoch": 0.04408, "grad_norm": 2.46875, "grad_norm_var": 0.026395670572916665, "learning_rate": 0.0001, "loss": 5.0557, "loss/crossentropy": 2.4397774934768677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3508765548467636, "step": 2204 }, { "epoch": 0.04412, "grad_norm": 2.890625, "grad_norm_var": 0.028880818684895834, "learning_rate": 0.0001, "loss": 4.966, "loss/crossentropy": 1.8136217594146729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495090439915657, "step": 2206 }, { "epoch": 0.04416, "grad_norm": 2.71875, "grad_norm_var": 0.022945149739583334, "learning_rate": 0.0001, "loss": 5.11, "loss/crossentropy": 2.4620203971862793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31419822573661804, "step": 2208 }, { "epoch": 0.0442, "grad_norm": 2.625, "grad_norm_var": 0.020042928059895833, "learning_rate": 0.0001, "loss": 4.9756, "loss/crossentropy": 1.8817986249923706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705220878124237, "step": 2210 }, { "epoch": 0.04424, "grad_norm": 3.125, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 5.0069, "loss/crossentropy": 1.9593598246574402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.262384794652462, "step": 2212 }, { "epoch": 0.04428, "grad_norm": 2.484375, "grad_norm_var": 0.03303120930989583, "learning_rate": 0.0001, "loss": 4.9133, "loss/crossentropy": 2.1003851294517517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28965799510478973, "step": 2214 }, { "epoch": 0.04432, "grad_norm": 3.46875, "grad_norm_var": 0.07888081868489584, "learning_rate": 0.0001, "loss": 5.3243, "loss/crossentropy": 2.23227858543396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3120953291654587, "step": 2216 }, { "epoch": 0.04436, "grad_norm": 2.625, "grad_norm_var": 0.0726226806640625, "learning_rate": 0.0001, "loss": 5.0901, "loss/crossentropy": 1.8880399465560913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2740217447280884, "step": 2218 }, { "epoch": 0.0444, "grad_norm": 2.5, "grad_norm_var": 0.07111714680989584, "learning_rate": 0.0001, "loss": 4.9444, "loss/crossentropy": 2.132355511188507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27536119520664215, "step": 2220 }, { "epoch": 0.04444, "grad_norm": 2.578125, "grad_norm_var": 0.07419331868489583, "learning_rate": 0.0001, "loss": 4.7325, "loss/crossentropy": 1.831633746623993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26966987550258636, "step": 2222 }, { "epoch": 0.04448, "grad_norm": 2.703125, "grad_norm_var": 0.08056538899739583, "learning_rate": 0.0001, "loss": 5.004, "loss/crossentropy": 2.066656529903412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27303647994995117, "step": 2224 }, { "epoch": 0.04452, "grad_norm": 2.75, "grad_norm_var": 0.0809234619140625, "learning_rate": 0.0001, "loss": 4.9265, "loss/crossentropy": 2.1416667699813843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2781240791082382, "step": 2226 }, { "epoch": 0.04456, "grad_norm": 2.671875, "grad_norm_var": 0.07366536458333334, "learning_rate": 0.0001, "loss": 5.0701, "loss/crossentropy": 1.7953566908836365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27528999745845795, "step": 2228 }, { "epoch": 0.0446, "grad_norm": 2.84375, "grad_norm_var": 0.06728515625, "learning_rate": 0.0001, "loss": 5.0182, "loss/crossentropy": 2.1580333709716797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2998732179403305, "step": 2230 }, { "epoch": 0.04464, "grad_norm": 2.9375, "grad_norm_var": 0.034326171875, "learning_rate": 0.0001, "loss": 5.3619, "loss/crossentropy": 2.1685701608657837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3457205891609192, "step": 2232 }, { "epoch": 0.04468, "grad_norm": 2.578125, "grad_norm_var": 0.03345438639322917, "learning_rate": 0.0001, "loss": 4.7602, "loss/crossentropy": 1.9424286484718323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2728651314973831, "step": 2234 }, { "epoch": 0.04472, "grad_norm": 2.765625, "grad_norm_var": 0.029736328125, "learning_rate": 0.0001, "loss": 5.2351, "loss/crossentropy": 2.2802772521972656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30141082406044006, "step": 2236 }, { "epoch": 0.04476, "grad_norm": 2.515625, "grad_norm_var": 0.03277587890625, "learning_rate": 0.0001, "loss": 4.8906, "loss/crossentropy": 1.9490987062454224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2573640048503876, "step": 2238 }, { "epoch": 0.0448, "grad_norm": 3.171875, "grad_norm_var": 0.037679036458333336, "learning_rate": 0.0001, "loss": 5.2112, "loss/crossentropy": 1.993924081325531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25628305971622467, "step": 2240 }, { "epoch": 0.04484, "grad_norm": 2.6875, "grad_norm_var": 0.0359283447265625, "learning_rate": 0.0001, "loss": 5.268, "loss/crossentropy": 2.5151875019073486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.326447993516922, "step": 2242 }, { "epoch": 0.04488, "grad_norm": 2.609375, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 5.0191, "loss/crossentropy": 2.5175565481185913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3036232739686966, "step": 2244 }, { "epoch": 0.04492, "grad_norm": 2.546875, "grad_norm_var": 0.05388997395833333, "learning_rate": 0.0001, "loss": 4.8489, "loss/crossentropy": 2.020721971988678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28087201714515686, "step": 2246 }, { "epoch": 0.04496, "grad_norm": 2.96875, "grad_norm_var": 0.045703125, "learning_rate": 0.0001, "loss": 5.6809, "loss/crossentropy": 2.4800511598587036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3100287467241287, "step": 2248 }, { "epoch": 0.045, "grad_norm": 2.59375, "grad_norm_var": 0.0449127197265625, "learning_rate": 0.0001, "loss": 4.9055, "loss/crossentropy": 1.826172411441803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2782330811023712, "step": 2250 }, { "epoch": 0.04504, "grad_norm": 2.828125, "grad_norm_var": 0.04533589680989583, "learning_rate": 0.0001, "loss": 5.133, "loss/crossentropy": 2.256316304206848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3070906698703766, "step": 2252 }, { "epoch": 0.04508, "grad_norm": 2.765625, "grad_norm_var": 0.041402180989583336, "learning_rate": 0.0001, "loss": 5.173, "loss/crossentropy": 1.9046601057052612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2894355356693268, "step": 2254 }, { "epoch": 0.04512, "grad_norm": 2.828125, "grad_norm_var": 0.03288472493489583, "learning_rate": 0.0001, "loss": 5.0311, "loss/crossentropy": 1.8359373211860657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705196440219879, "step": 2256 }, { "epoch": 0.04516, "grad_norm": 2.96875, "grad_norm_var": 0.0349273681640625, "learning_rate": 0.0001, "loss": 4.717, "loss/crossentropy": 2.096512258052826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26635921001434326, "step": 2258 }, { "epoch": 0.0452, "grad_norm": 2.984375, "grad_norm_var": 0.031371053059895834, "learning_rate": 0.0001, "loss": 5.6736, "loss/crossentropy": 2.4621278047561646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3513137400150299, "step": 2260 }, { "epoch": 0.04524, "grad_norm": 2.828125, "grad_norm_var": 0.020442708333333334, "learning_rate": 0.0001, "loss": 5.1413, "loss/crossentropy": 1.8345229029655457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2414976954460144, "step": 2262 }, { "epoch": 0.04528, "grad_norm": 2.65625, "grad_norm_var": 0.026432291666666666, "learning_rate": 0.0001, "loss": 4.9746, "loss/crossentropy": 2.24505877494812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31794628500938416, "step": 2264 }, { "epoch": 0.04532, "grad_norm": 2.609375, "grad_norm_var": 0.0285552978515625, "learning_rate": 0.0001, "loss": 5.1691, "loss/crossentropy": 2.2141382694244385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2925301343202591, "step": 2266 }, { "epoch": 0.04536, "grad_norm": 3.1875, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 5.0559, "loss/crossentropy": 2.1515613794326782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2843547910451889, "step": 2268 }, { "epoch": 0.0454, "grad_norm": 2.765625, "grad_norm_var": 0.03422749837239583, "learning_rate": 0.0001, "loss": 4.6899, "loss/crossentropy": 2.1234883666038513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981575280427933, "step": 2270 }, { "epoch": 0.04544, "grad_norm": 2.65625, "grad_norm_var": 0.03806864420572917, "learning_rate": 0.0001, "loss": 4.9871, "loss/crossentropy": 2.1212490797042847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.276496559381485, "step": 2272 }, { "epoch": 0.04548, "grad_norm": 2.921875, "grad_norm_var": 0.03574930826822917, "learning_rate": 0.0001, "loss": 5.2925, "loss/crossentropy": 2.4330636262893677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2815839499235153, "step": 2274 }, { "epoch": 0.04552, "grad_norm": 2.65625, "grad_norm_var": 0.028238932291666668, "learning_rate": 0.0001, "loss": 5.2874, "loss/crossentropy": 2.110591411590576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2832919806241989, "step": 2276 }, { "epoch": 0.04556, "grad_norm": 2.875, "grad_norm_var": 0.20754801432291667, "learning_rate": 0.0001, "loss": 5.0303, "loss/crossentropy": 2.231989800930023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28183089196681976, "step": 2278 }, { "epoch": 0.0456, "grad_norm": 2.84375, "grad_norm_var": 0.194873046875, "learning_rate": 0.0001, "loss": 5.3746, "loss/crossentropy": 2.1275558471679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29279497265815735, "step": 2280 }, { "epoch": 0.04564, "grad_norm": 2.8125, "grad_norm_var": 0.1891510009765625, "learning_rate": 0.0001, "loss": 5.2023, "loss/crossentropy": 1.7988306283950806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2625636160373688, "step": 2282 }, { "epoch": 0.04568, "grad_norm": 2.65625, "grad_norm_var": 0.18694254557291667, "learning_rate": 0.0001, "loss": 5.2017, "loss/crossentropy": 2.3405990600585938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30855217576026917, "step": 2284 }, { "epoch": 0.04572, "grad_norm": 2.828125, "grad_norm_var": 0.18612874348958333, "learning_rate": 0.0001, "loss": 5.4582, "loss/crossentropy": 2.2062121629714966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30755001306533813, "step": 2286 }, { "epoch": 0.04576, "grad_norm": 2.5625, "grad_norm_var": 0.18968098958333332, "learning_rate": 0.0001, "loss": 4.8984, "loss/crossentropy": 1.9439310431480408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26844222843647003, "step": 2288 }, { "epoch": 0.0458, "grad_norm": 2.78125, "grad_norm_var": 0.19010416666666666, "learning_rate": 0.0001, "loss": 5.2097, "loss/crossentropy": 2.3106162548065186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30821681022644043, "step": 2290 }, { "epoch": 0.04584, "grad_norm": 2.640625, "grad_norm_var": 0.19246317545572916, "learning_rate": 0.0001, "loss": 5.1401, "loss/crossentropy": 2.3809561729431152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3307010233402252, "step": 2292 }, { "epoch": 0.04588, "grad_norm": 2.703125, "grad_norm_var": 0.01103515625, "learning_rate": 0.0001, "loss": 5.4066, "loss/crossentropy": 2.209702253341675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2795914113521576, "step": 2294 }, { "epoch": 0.04592, "grad_norm": 2.5625, "grad_norm_var": 0.0158111572265625, "learning_rate": 0.0001, "loss": 4.8772, "loss/crossentropy": 2.4084372520446777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31208573281764984, "step": 2296 }, { "epoch": 0.04596, "grad_norm": 2.6875, "grad_norm_var": 0.0146392822265625, "learning_rate": 0.0001, "loss": 4.7757, "loss/crossentropy": 1.9384723901748657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2625032365322113, "step": 2298 }, { "epoch": 0.046, "grad_norm": 2.78125, "grad_norm_var": 0.013704427083333333, "learning_rate": 0.0001, "loss": 5.2455, "loss/crossentropy": 2.150592088699341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.278359517455101, "step": 2300 }, { "epoch": 0.04604, "grad_norm": 2.65625, "grad_norm_var": 0.01256103515625, "learning_rate": 0.0001, "loss": 5.0788, "loss/crossentropy": 1.8317970037460327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2927433103322983, "step": 2302 }, { "epoch": 0.04608, "grad_norm": 3.09375, "grad_norm_var": 0.029564412434895833, "learning_rate": 0.0001, "loss": 5.091, "loss/crossentropy": 2.323367118835449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2943577915430069, "step": 2304 }, { "epoch": 0.04612, "grad_norm": 2.859375, "grad_norm_var": 0.03943583170572917, "learning_rate": 0.0001, "loss": 5.5301, "loss/crossentropy": 2.369907855987549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2990037202835083, "step": 2306 }, { "epoch": 0.04616, "grad_norm": 2.921875, "grad_norm_var": 0.04127197265625, "learning_rate": 0.0001, "loss": 4.7508, "loss/crossentropy": 1.691443145275116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27280642092227936, "step": 2308 }, { "epoch": 0.0462, "grad_norm": 2.71875, "grad_norm_var": 0.0464508056640625, "learning_rate": 0.0001, "loss": 4.9413, "loss/crossentropy": 2.2883838415145874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3064710944890976, "step": 2310 }, { "epoch": 0.04624, "grad_norm": 2.671875, "grad_norm_var": 0.035380045572916664, "learning_rate": 0.0001, "loss": 5.4165, "loss/crossentropy": 2.2042444944381714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3241504430770874, "step": 2312 }, { "epoch": 0.04628, "grad_norm": 2.609375, "grad_norm_var": 0.048680623372395836, "learning_rate": 0.0001, "loss": 4.6657, "loss/crossentropy": 1.977793574333191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775610163807869, "step": 2314 }, { "epoch": 0.04632, "grad_norm": 24.875, "grad_norm_var": 30.570881144205728, "learning_rate": 0.0001, "loss": 5.8585, "loss/crossentropy": 2.034530758857727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27170561254024506, "step": 2316 }, { "epoch": 0.04636, "grad_norm": 2.875, "grad_norm_var": 30.404881795247395, "learning_rate": 0.0001, "loss": 5.1565, "loss/crossentropy": 2.439123511314392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2999258190393448, "step": 2318 }, { "epoch": 0.0464, "grad_norm": 2.5, "grad_norm_var": 30.530557250976564, "learning_rate": 0.0001, "loss": 4.8907, "loss/crossentropy": 2.2600624561309814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2952795475721359, "step": 2320 }, { "epoch": 0.04644, "grad_norm": 2.859375, "grad_norm_var": 30.544131469726562, "learning_rate": 0.0001, "loss": 5.0521, "loss/crossentropy": 2.144679367542267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28713342547416687, "step": 2322 }, { "epoch": 0.04648, "grad_norm": 2.890625, "grad_norm_var": 30.469155883789064, "learning_rate": 0.0001, "loss": 5.5054, "loss/crossentropy": 2.34474778175354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.292633980512619, "step": 2324 }, { "epoch": 0.04652, "grad_norm": 2.609375, "grad_norm_var": 30.491536458333332, "learning_rate": 0.0001, "loss": 4.5903, "loss/crossentropy": 1.96743243932724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.265610933303833, "step": 2326 }, { "epoch": 0.04656, "grad_norm": 2.53125, "grad_norm_var": 30.524051920572916, "learning_rate": 0.0001, "loss": 5.2353, "loss/crossentropy": 2.3895785808563232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3287513107061386, "step": 2328 }, { "epoch": 0.0466, "grad_norm": 2.984375, "grad_norm_var": 30.446451822916668, "learning_rate": 0.0001, "loss": 4.9017, "loss/crossentropy": 2.0607098937034607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30049796402454376, "step": 2330 }, { "epoch": 0.04664, "grad_norm": 2.96875, "grad_norm_var": 0.08322652180989583, "learning_rate": 0.0001, "loss": 5.1698, "loss/crossentropy": 2.162124752998352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2983020693063736, "step": 2332 }, { "epoch": 0.04668, "grad_norm": 2.890625, "grad_norm_var": 0.05139567057291667, "learning_rate": 0.0001, "loss": 5.2702, "loss/crossentropy": 2.34970760345459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3322293907403946, "step": 2334 }, { "epoch": 0.04672, "grad_norm": 2.703125, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 4.9763, "loss/crossentropy": 1.9286972284317017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752893418073654, "step": 2336 }, { "epoch": 0.04676, "grad_norm": 2.828125, "grad_norm_var": 0.04159749348958333, "learning_rate": 0.0001, "loss": 5.1677, "loss/crossentropy": 2.182044267654419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30808278918266296, "step": 2338 }, { "epoch": 0.0468, "grad_norm": 2.96875, "grad_norm_var": 0.03916727701822917, "learning_rate": 0.0001, "loss": 5.28, "loss/crossentropy": 2.0002610087394714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2944178581237793, "step": 2340 }, { "epoch": 0.04684, "grad_norm": 2.609375, "grad_norm_var": 0.034830729166666664, "learning_rate": 0.0001, "loss": 4.984, "loss/crossentropy": 2.0721842646598816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2769011855125427, "step": 2342 }, { "epoch": 0.04688, "grad_norm": 2.625, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 5.0911, "loss/crossentropy": 1.9710460305213928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28619086742401123, "step": 2344 }, { "epoch": 0.04692, "grad_norm": 3.28125, "grad_norm_var": 1.54605712890625, "learning_rate": 0.0001, "loss": 5.5092, "loss/crossentropy": 2.0506762266159058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31705181300640106, "step": 2346 }, { "epoch": 0.04696, "grad_norm": 2.609375, "grad_norm_var": 1.5601064046223958, "learning_rate": 0.0001, "loss": 5.0045, "loss/crossentropy": 2.0095930695533752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2831149846315384, "step": 2348 }, { "epoch": 0.047, "grad_norm": 2.890625, "grad_norm_var": 1.573631795247396, "learning_rate": 0.0001, "loss": 5.19, "loss/crossentropy": 2.023163616657257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2813292294740677, "step": 2350 }, { "epoch": 0.04704, "grad_norm": 2.671875, "grad_norm_var": 1.561424763997396, "learning_rate": 0.0001, "loss": 5.2907, "loss/crossentropy": 2.230435371398926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30891451239585876, "step": 2352 }, { "epoch": 0.04708, "grad_norm": 3.0625, "grad_norm_var": 1.5700154622395834, "learning_rate": 0.0001, "loss": 4.9261, "loss/crossentropy": 2.1553521156311035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29839709401130676, "step": 2354 }, { "epoch": 0.04712, "grad_norm": 2.96875, "grad_norm_var": 1.5770792643229166, "learning_rate": 0.0001, "loss": 5.2553, "loss/crossentropy": 2.175648272037506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2972148358821869, "step": 2356 }, { "epoch": 0.04716, "grad_norm": 2.703125, "grad_norm_var": 1.5980377197265625, "learning_rate": 0.0001, "loss": 5.0436, "loss/crossentropy": 2.3852503299713135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3166612535715103, "step": 2358 }, { "epoch": 0.0472, "grad_norm": 2.75, "grad_norm_var": 1.5826456705729166, "learning_rate": 0.0001, "loss": 5.1827, "loss/crossentropy": 2.1905999183654785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29487256705760956, "step": 2360 }, { "epoch": 0.04724, "grad_norm": 2.5625, "grad_norm_var": 0.0413482666015625, "learning_rate": 0.0001, "loss": 4.9295, "loss/crossentropy": 1.9224759340286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27821336686611176, "step": 2362 }, { "epoch": 0.04728, "grad_norm": 3.109375, "grad_norm_var": 0.04057515462239583, "learning_rate": 0.0001, "loss": 5.1078, "loss/crossentropy": 2.5025261640548706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3345927745103836, "step": 2364 }, { "epoch": 0.04732, "grad_norm": 2.640625, "grad_norm_var": 0.04006245930989583, "learning_rate": 0.0001, "loss": 5.0502, "loss/crossentropy": 2.2385451793670654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27095621824264526, "step": 2366 }, { "epoch": 0.04736, "grad_norm": 4.5, "grad_norm_var": 0.22924702962239582, "learning_rate": 0.0001, "loss": 5.2761, "loss/crossentropy": 2.0266553163528442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2856762409210205, "step": 2368 }, { "epoch": 0.0474, "grad_norm": 3.359375, "grad_norm_var": 0.24001363118489583, "learning_rate": 0.0001, "loss": 5.4918, "loss/crossentropy": 2.5139355659484863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3186872750520706, "step": 2370 }, { "epoch": 0.04744, "grad_norm": 2.609375, "grad_norm_var": 0.24321187337239583, "learning_rate": 0.0001, "loss": 5.1302, "loss/crossentropy": 2.066476881504059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2889470160007477, "step": 2372 }, { "epoch": 0.04748, "grad_norm": 2.671875, "grad_norm_var": 0.23772786458333334, "learning_rate": 0.0001, "loss": 5.0031, "loss/crossentropy": 2.0537307262420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31795741617679596, "step": 2374 }, { "epoch": 0.04752, "grad_norm": 2.546875, "grad_norm_var": 0.24492085774739583, "learning_rate": 0.0001, "loss": 4.9922, "loss/crossentropy": 1.9254986643791199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25411880016326904, "step": 2376 }, { "epoch": 0.04756, "grad_norm": 2.671875, "grad_norm_var": 0.23855692545572918, "learning_rate": 0.0001, "loss": 5.0284, "loss/crossentropy": 2.221043348312378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28534361720085144, "step": 2378 }, { "epoch": 0.0476, "grad_norm": 2.90625, "grad_norm_var": 0.228271484375, "learning_rate": 0.0001, "loss": 5.2106, "loss/crossentropy": 2.3516281843185425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3128499984741211, "step": 2380 }, { "epoch": 0.04764, "grad_norm": 2.734375, "grad_norm_var": 0.22349853515625, "learning_rate": 0.0001, "loss": 5.6266, "loss/crossentropy": 2.2144338488578796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3071902245283127, "step": 2382 }, { "epoch": 0.04768, "grad_norm": 2.625, "grad_norm_var": 0.0567291259765625, "learning_rate": 0.0001, "loss": 5.2429, "loss/crossentropy": 2.3324203491210938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.322970449924469, "step": 2384 }, { "epoch": 0.04772, "grad_norm": 2.546875, "grad_norm_var": 0.03332926432291667, "learning_rate": 0.0001, "loss": 4.7732, "loss/crossentropy": 2.08349871635437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2774003893136978, "step": 2386 }, { "epoch": 0.04776, "grad_norm": 2.4375, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 4.8585, "loss/crossentropy": 1.9565780758857727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29790589213371277, "step": 2388 }, { "epoch": 0.0478, "grad_norm": 2.625, "grad_norm_var": 0.022782389322916666, "learning_rate": 0.0001, "loss": 5.0176, "loss/crossentropy": 2.261398434638977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3143853694200516, "step": 2390 }, { "epoch": 0.04784, "grad_norm": 2.96875, "grad_norm_var": 0.024494425455729166, "learning_rate": 0.0001, "loss": 5.0688, "loss/crossentropy": 1.9077460169792175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25433091819286346, "step": 2392 }, { "epoch": 0.04788, "grad_norm": 2.65625, "grad_norm_var": 0.0240142822265625, "learning_rate": 0.0001, "loss": 4.9531, "loss/crossentropy": 1.9948468208312988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598777562379837, "step": 2394 }, { "epoch": 0.04792, "grad_norm": 2.9375, "grad_norm_var": 0.45455729166666664, "learning_rate": 0.0001, "loss": 5.0972, "loss/crossentropy": 2.1177526116371155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775426208972931, "step": 2396 }, { "epoch": 0.04796, "grad_norm": 2.5, "grad_norm_var": 0.4634348551432292, "learning_rate": 0.0001, "loss": 4.8571, "loss/crossentropy": 2.1756062507629395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33678852021694183, "step": 2398 }, { "epoch": 0.048, "grad_norm": 2.984375, "grad_norm_var": 0.4576171875, "learning_rate": 0.0001, "loss": 5.2617, "loss/crossentropy": 2.0923725366592407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3255026638507843, "step": 2400 }, { "epoch": 0.04804, "grad_norm": 2.84375, "grad_norm_var": 0.4471181233723958, "learning_rate": 0.0001, "loss": 4.9421, "loss/crossentropy": 1.9236682653427124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2642124071717262, "step": 2402 }, { "epoch": 0.04808, "grad_norm": 2.84375, "grad_norm_var": 0.43757222493489584, "learning_rate": 0.0001, "loss": 5.0604, "loss/crossentropy": 2.1742242574691772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30006398260593414, "step": 2404 }, { "epoch": 0.04812, "grad_norm": 2.6875, "grad_norm_var": 0.43835347493489585, "learning_rate": 0.0001, "loss": 4.7077, "loss/crossentropy": 1.7445701956748962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25791122019290924, "step": 2406 }, { "epoch": 0.04816, "grad_norm": 4.34375, "grad_norm_var": 0.5614735921223958, "learning_rate": 0.0001, "loss": 5.1289, "loss/crossentropy": 1.8616467714309692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532489001750946, "step": 2408 }, { "epoch": 0.0482, "grad_norm": 3.203125, "grad_norm_var": 0.5608723958333334, "learning_rate": 0.0001, "loss": 5.0486, "loss/crossentropy": 1.9146783351898193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2659924626350403, "step": 2410 }, { "epoch": 0.04824, "grad_norm": 2.859375, "grad_norm_var": 0.21818745930989583, "learning_rate": 0.0001, "loss": 5.0972, "loss/crossentropy": 2.179564118385315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29064056277275085, "step": 2412 }, { "epoch": 0.04828, "grad_norm": 2.65625, "grad_norm_var": 0.21357421875, "learning_rate": 0.0001, "loss": 4.9578, "loss/crossentropy": 2.04409658908844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27891072630882263, "step": 2414 }, { "epoch": 0.04832, "grad_norm": 2.625, "grad_norm_var": 0.22568257649739584, "learning_rate": 0.0001, "loss": 4.8833, "loss/crossentropy": 2.590337038040161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3158426731824875, "step": 2416 }, { "epoch": 0.04836, "grad_norm": 2.625, "grad_norm_var": 0.23155924479166667, "learning_rate": 0.0001, "loss": 4.6919, "loss/crossentropy": 1.8753941059112549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2669401317834854, "step": 2418 }, { "epoch": 0.0484, "grad_norm": 2.734375, "grad_norm_var": 0.23361714680989584, "learning_rate": 0.0001, "loss": 5.0231, "loss/crossentropy": 2.1412659287452698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2839512526988983, "step": 2420 }, { "epoch": 0.04844, "grad_norm": 2.65625, "grad_norm_var": 0.23371988932291668, "learning_rate": 0.0001, "loss": 5.1187, "loss/crossentropy": 2.545991063117981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3331379294395447, "step": 2422 }, { "epoch": 0.04848, "grad_norm": 3.25, "grad_norm_var": 0.089599609375, "learning_rate": 0.0001, "loss": 5.1246, "loss/crossentropy": 2.12838077545166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3043065369129181, "step": 2424 }, { "epoch": 0.04852, "grad_norm": 2.953125, "grad_norm_var": 0.03660380045572917, "learning_rate": 0.0001, "loss": 5.3821, "loss/crossentropy": 2.1983221769332886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2846619784832001, "step": 2426 }, { "epoch": 0.04856, "grad_norm": 3.0625, "grad_norm_var": 0.03819986979166667, "learning_rate": 0.0001, "loss": 5.1044, "loss/crossentropy": 2.241136312484741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30275705456733704, "step": 2428 }, { "epoch": 0.0486, "grad_norm": 2.890625, "grad_norm_var": 0.03831278483072917, "learning_rate": 0.0001, "loss": 5.203, "loss/crossentropy": 2.097459554672241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2986721396446228, "step": 2430 }, { "epoch": 0.04864, "grad_norm": 4.4375, "grad_norm_var": 0.20159403483072916, "learning_rate": 0.0001, "loss": 5.1158, "loss/crossentropy": 2.333081007003784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2975248098373413, "step": 2432 }, { "epoch": 0.04868, "grad_norm": 2.625, "grad_norm_var": 0.19758707682291668, "learning_rate": 0.0001, "loss": 4.9183, "loss/crossentropy": 2.3171510696411133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28452740609645844, "step": 2434 }, { "epoch": 0.04872, "grad_norm": 2.375, "grad_norm_var": 0.2141754150390625, "learning_rate": 0.0001, "loss": 4.8853, "loss/crossentropy": 1.8334497213363647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25271379947662354, "step": 2436 }, { "epoch": 0.04876, "grad_norm": 2.78125, "grad_norm_var": 0.20258687337239584, "learning_rate": 0.0001, "loss": 5.3809, "loss/crossentropy": 2.2712661027908325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30827929079532623, "step": 2438 }, { "epoch": 0.0488, "grad_norm": 3.3125, "grad_norm_var": 0.20465087890625, "learning_rate": 0.0001, "loss": 5.0863, "loss/crossentropy": 2.160263180732727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3078690320253372, "step": 2440 }, { "epoch": 0.04884, "grad_norm": 2.96875, "grad_norm_var": 0.20429585774739584, "learning_rate": 0.0001, "loss": 5.2224, "loss/crossentropy": 2.071319878101349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2733730524778366, "step": 2442 }, { "epoch": 0.04888, "grad_norm": 2.796875, "grad_norm_var": 0.203076171875, "learning_rate": 0.0001, "loss": 5.1476, "loss/crossentropy": 2.0742560029029846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835587412118912, "step": 2444 }, { "epoch": 0.04892, "grad_norm": 2.765625, "grad_norm_var": 0.24807535807291667, "learning_rate": 0.0001, "loss": 5.0211, "loss/crossentropy": 1.8836837410926819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3089100867509842, "step": 2446 }, { "epoch": 0.04896, "grad_norm": 2.875, "grad_norm_var": 0.177099609375, "learning_rate": 0.0001, "loss": 4.9537, "loss/crossentropy": 1.8539315462112427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2697101980447769, "step": 2448 }, { "epoch": 0.049, "grad_norm": 2.828125, "grad_norm_var": 0.17392578125, "learning_rate": 0.0001, "loss": 5.1907, "loss/crossentropy": 2.219490647315979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2898380011320114, "step": 2450 }, { "epoch": 0.04904, "grad_norm": 3.1875, "grad_norm_var": 0.13853251139322917, "learning_rate": 0.0001, "loss": 5.468, "loss/crossentropy": 2.328765392303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.4013051837682724, "step": 2452 }, { "epoch": 0.04908, "grad_norm": 2.5625, "grad_norm_var": 0.14879557291666667, "learning_rate": 0.0001, "loss": 4.8967, "loss/crossentropy": 1.9204192161560059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25281261652708054, "step": 2454 }, { "epoch": 0.04912, "grad_norm": 2.765625, "grad_norm_var": 0.15563151041666667, "learning_rate": 0.0001, "loss": 5.0935, "loss/crossentropy": 2.377043604850769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2928486764431, "step": 2456 }, { "epoch": 0.04916, "grad_norm": 2.796875, "grad_norm_var": 0.15930989583333333, "learning_rate": 0.0001, "loss": 5.4528, "loss/crossentropy": 2.4364209175109863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3111976683139801, "step": 2458 }, { "epoch": 0.0492, "grad_norm": 2.703125, "grad_norm_var": 0.15985921223958333, "learning_rate": 0.0001, "loss": 5.1357, "loss/crossentropy": 2.3738330602645874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3019126206636429, "step": 2460 }, { "epoch": 0.04924, "grad_norm": 2.765625, "grad_norm_var": 0.1141265869140625, "learning_rate": 0.0001, "loss": 5.2876, "loss/crossentropy": 2.4575772285461426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3428986072540283, "step": 2462 }, { "epoch": 0.04928, "grad_norm": 2.734375, "grad_norm_var": 0.028888956705729166, "learning_rate": 0.0001, "loss": 4.6505, "loss/crossentropy": 1.9849627017974854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27611130475997925, "step": 2464 }, { "epoch": 0.04932, "grad_norm": 2.625, "grad_norm_var": 0.030402628580729167, "learning_rate": 0.0001, "loss": 4.8482, "loss/crossentropy": 1.9617170691490173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.257377490401268, "step": 2466 }, { "epoch": 0.04936, "grad_norm": 2.640625, "grad_norm_var": 0.01871337890625, "learning_rate": 0.0001, "loss": 5.1739, "loss/crossentropy": 2.4031273126602173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28843145072460175, "step": 2468 }, { "epoch": 0.0494, "grad_norm": 2.484375, "grad_norm_var": 0.0165435791015625, "learning_rate": 0.0001, "loss": 4.8452, "loss/crossentropy": 1.7263792753219604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25591571629047394, "step": 2470 }, { "epoch": 0.04944, "grad_norm": 2.734375, "grad_norm_var": 0.0191802978515625, "learning_rate": 0.0001, "loss": 4.7154, "loss/crossentropy": 2.106898784637451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.280623197555542, "step": 2472 }, { "epoch": 0.04948, "grad_norm": 2.6875, "grad_norm_var": 0.018358357747395835, "learning_rate": 0.0001, "loss": 5.0092, "loss/crossentropy": 2.325208902359009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27794161438941956, "step": 2474 }, { "epoch": 0.04952, "grad_norm": 2.65625, "grad_norm_var": 0.013736979166666666, "learning_rate": 0.0001, "loss": 5.1128, "loss/crossentropy": 2.367414712905884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31346653401851654, "step": 2476 }, { "epoch": 0.04956, "grad_norm": 2.59375, "grad_norm_var": 0.013792928059895833, "learning_rate": 0.0001, "loss": 5.2611, "loss/crossentropy": 2.191115140914917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30560287833213806, "step": 2478 }, { "epoch": 0.0496, "grad_norm": 2.765625, "grad_norm_var": 0.012430826822916666, "learning_rate": 0.0001, "loss": 4.9993, "loss/crossentropy": 2.2559624314308167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3029457628726959, "step": 2480 }, { "epoch": 0.04964, "grad_norm": 3.390625, "grad_norm_var": 0.04160868326822917, "learning_rate": 0.0001, "loss": 5.125, "loss/crossentropy": 2.0088382363319397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2923784404993057, "step": 2482 }, { "epoch": 0.04968, "grad_norm": 2.71875, "grad_norm_var": 0.03943583170572917, "learning_rate": 0.0001, "loss": 4.8858, "loss/crossentropy": 1.8445329070091248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2587483897805214, "step": 2484 }, { "epoch": 0.04972, "grad_norm": 2.8125, "grad_norm_var": 0.03534749348958333, "learning_rate": 0.0001, "loss": 4.7918, "loss/crossentropy": 2.015140950679779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27480585873126984, "step": 2486 }, { "epoch": 0.04976, "grad_norm": 2.765625, "grad_norm_var": 0.030833943684895834, "learning_rate": 0.0001, "loss": 5.1959, "loss/crossentropy": 1.918801188468933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28786011040210724, "step": 2488 }, { "epoch": 0.0498, "grad_norm": 2.734375, "grad_norm_var": 0.030269368489583334, "learning_rate": 0.0001, "loss": 5.0108, "loss/crossentropy": 1.9899121522903442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25822708010673523, "step": 2490 }, { "epoch": 0.04984, "grad_norm": 2.59375, "grad_norm_var": 0.037206013997395836, "learning_rate": 0.0001, "loss": 4.7259, "loss/crossentropy": 2.3775535821914673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31345370411872864, "step": 2492 }, { "epoch": 0.04988, "grad_norm": 2.71875, "grad_norm_var": 0.03791402180989583, "learning_rate": 0.0001, "loss": 4.9496, "loss/crossentropy": 2.0874632596969604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2836592495441437, "step": 2494 }, { "epoch": 0.04992, "grad_norm": 2.78125, "grad_norm_var": 0.046019490559895834, "learning_rate": 0.0001, "loss": 5.2404, "loss/crossentropy": 2.226976454257965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27826404571533203, "step": 2496 }, { "epoch": 0.04996, "grad_norm": 2.71875, "grad_norm_var": 0.023900349934895832, "learning_rate": 0.0001, "loss": 5.1503, "loss/crossentropy": 2.4569294452667236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31981319189071655, "step": 2498 }, { "epoch": 0.05, "grad_norm": 2.703125, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 4.922, "loss/crossentropy": 2.1134212017059326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28260529041290283, "step": 2500 }, { "epoch": 0.05004, "grad_norm": 3.0, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 5.3555, "loss/crossentropy": 2.2914888858795166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3182682394981384, "step": 2502 }, { "epoch": 0.05008, "grad_norm": 2.65625, "grad_norm_var": 0.030078125, "learning_rate": 0.0001, "loss": 4.9644, "loss/crossentropy": 2.3261003494262695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30558250844478607, "step": 2504 }, { "epoch": 0.05012, "grad_norm": 2.703125, "grad_norm_var": 0.030598958333333332, "learning_rate": 0.0001, "loss": 4.9517, "loss/crossentropy": 2.351989507675171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2934701144695282, "step": 2506 }, { "epoch": 0.05016, "grad_norm": 2.5625, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 4.71, "loss/crossentropy": 1.8742690086364746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26305729895830154, "step": 2508 }, { "epoch": 0.0502, "grad_norm": 2.78125, "grad_norm_var": 0.0247222900390625, "learning_rate": 0.0001, "loss": 4.8214, "loss/crossentropy": 2.1668856143951416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2728075534105301, "step": 2510 }, { "epoch": 0.05024, "grad_norm": 2.734375, "grad_norm_var": 0.0137115478515625, "learning_rate": 0.0001, "loss": 4.7171, "loss/crossentropy": 1.773424208164215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26808495819568634, "step": 2512 }, { "epoch": 0.05028, "grad_norm": 2.59375, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 4.9224, "loss/crossentropy": 1.7541643977165222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2600491940975189, "step": 2514 }, { "epoch": 0.05032, "grad_norm": 2.796875, "grad_norm_var": 0.011324055989583333, "learning_rate": 0.0001, "loss": 5.129, "loss/crossentropy": 1.9693496227264404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2844541072845459, "step": 2516 }, { "epoch": 0.05036, "grad_norm": 2.765625, "grad_norm_var": 0.009065755208333333, "learning_rate": 0.0001, "loss": 4.9202, "loss/crossentropy": 1.7539461851119995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542608380317688, "step": 2518 }, { "epoch": 0.0504, "grad_norm": 2.984375, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 5.0686, "loss/crossentropy": 2.155138313770294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3023010194301605, "step": 2520 }, { "epoch": 0.05044, "grad_norm": 2.609375, "grad_norm_var": 0.018619791666666666, "learning_rate": 0.0001, "loss": 4.9674, "loss/crossentropy": 2.069350838661194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26573850214481354, "step": 2522 }, { "epoch": 0.05048, "grad_norm": 2.5, "grad_norm_var": 0.022101847330729167, "learning_rate": 0.0001, "loss": 5.0295, "loss/crossentropy": 2.1864534616470337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27845603227615356, "step": 2524 }, { "epoch": 0.05052, "grad_norm": 2.5, "grad_norm_var": 0.023509724934895834, "learning_rate": 0.0001, "loss": 5.1173, "loss/crossentropy": 2.2462135553359985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29221346974372864, "step": 2526 }, { "epoch": 0.05056, "grad_norm": 2.78125, "grad_norm_var": 0.030492146809895832, "learning_rate": 0.0001, "loss": 5.0027, "loss/crossentropy": 2.043266773223877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25857964158058167, "step": 2528 }, { "epoch": 0.0506, "grad_norm": 2.65625, "grad_norm_var": 0.0299957275390625, "learning_rate": 0.0001, "loss": 4.9672, "loss/crossentropy": 1.8892702460289001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2879898101091385, "step": 2530 }, { "epoch": 0.05064, "grad_norm": 2.46875, "grad_norm_var": 0.03238525390625, "learning_rate": 0.0001, "loss": 4.5332, "loss/crossentropy": 1.9220558404922485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26652154326438904, "step": 2532 }, { "epoch": 0.05068, "grad_norm": 2.515625, "grad_norm_var": 0.03430989583333333, "learning_rate": 0.0001, "loss": 4.4176, "loss/crossentropy": 1.7282914519309998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2367371916770935, "step": 2534 }, { "epoch": 0.05072, "grad_norm": 2.78125, "grad_norm_var": 0.021092732747395832, "learning_rate": 0.0001, "loss": 4.9589, "loss/crossentropy": 1.729803204536438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25624871999025345, "step": 2536 }, { "epoch": 0.05076, "grad_norm": 2.578125, "grad_norm_var": 0.020873006184895834, "learning_rate": 0.0001, "loss": 4.6952, "loss/crossentropy": 2.0921449661254883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2855897545814514, "step": 2538 }, { "epoch": 0.0508, "grad_norm": 2.6875, "grad_norm_var": 0.015404256184895833, "learning_rate": 0.0001, "loss": 4.8967, "loss/crossentropy": 2.0569751858711243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2829667925834656, "step": 2540 }, { "epoch": 0.05084, "grad_norm": 2.609375, "grad_norm_var": 0.015973917643229165, "learning_rate": 0.0001, "loss": 5.2438, "loss/crossentropy": 1.983904242515564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30939212441444397, "step": 2542 }, { "epoch": 0.05088, "grad_norm": 2.703125, "grad_norm_var": 0.011864217122395833, "learning_rate": 0.0001, "loss": 4.9968, "loss/crossentropy": 2.2631462812423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27867285907268524, "step": 2544 }, { "epoch": 0.05092, "grad_norm": 3.078125, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 4.9621, "loss/crossentropy": 1.9918989539146423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27703428268432617, "step": 2546 }, { "epoch": 0.05096, "grad_norm": 2.78125, "grad_norm_var": 0.021110026041666667, "learning_rate": 0.0001, "loss": 5.2041, "loss/crossentropy": 2.1356931924819946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27815964818000793, "step": 2548 }, { "epoch": 0.051, "grad_norm": 2.46875, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 4.6514, "loss/crossentropy": 2.172751545906067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2971910834312439, "step": 2550 }, { "epoch": 0.05104, "grad_norm": 2.578125, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 5.0178, "loss/crossentropy": 2.0799094438552856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30459292232990265, "step": 2552 }, { "epoch": 0.05108, "grad_norm": 2.5, "grad_norm_var": 0.0211090087890625, "learning_rate": 0.0001, "loss": 4.8235, "loss/crossentropy": 1.7769129872322083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2526697665452957, "step": 2554 }, { "epoch": 0.05112, "grad_norm": 2.8125, "grad_norm_var": 0.024494425455729166, "learning_rate": 0.0001, "loss": 4.8457, "loss/crossentropy": 2.044790804386139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2821648418903351, "step": 2556 }, { "epoch": 0.05116, "grad_norm": 2.921875, "grad_norm_var": 0.0287109375, "learning_rate": 0.0001, "loss": 5.5336, "loss/crossentropy": 2.3708614110946655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30810464918613434, "step": 2558 }, { "epoch": 0.0512, "grad_norm": 2.71875, "grad_norm_var": 0.028709920247395833, "learning_rate": 0.0001, "loss": 5.2385, "loss/crossentropy": 2.2216718196868896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2919304668903351, "step": 2560 }, { "epoch": 0.05124, "grad_norm": 2.671875, "grad_norm_var": 0.019466145833333334, "learning_rate": 0.0001, "loss": 5.28, "loss/crossentropy": 2.4692097902297974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30620162189006805, "step": 2562 }, { "epoch": 0.05128, "grad_norm": 2.984375, "grad_norm_var": 0.026936848958333332, "learning_rate": 0.0001, "loss": 4.9476, "loss/crossentropy": 2.0491623282432556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25617313385009766, "step": 2564 }, { "epoch": 0.05132, "grad_norm": 2.8125, "grad_norm_var": 0.023368326822916667, "learning_rate": 0.0001, "loss": 4.958, "loss/crossentropy": 1.8305597305297852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25237561762332916, "step": 2566 }, { "epoch": 0.05136, "grad_norm": 2.640625, "grad_norm_var": 0.0223297119140625, "learning_rate": 0.0001, "loss": 4.9853, "loss/crossentropy": 1.9471853971481323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2712964415550232, "step": 2568 }, { "epoch": 0.0514, "grad_norm": 2.765625, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 5.0932, "loss/crossentropy": 2.575412631034851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3106851130723953, "step": 2570 }, { "epoch": 0.05144, "grad_norm": 2.515625, "grad_norm_var": 0.020686848958333334, "learning_rate": 0.0001, "loss": 4.6755, "loss/crossentropy": 2.0210241079330444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269680991768837, "step": 2572 }, { "epoch": 0.05148, "grad_norm": 2.4375, "grad_norm_var": 0.02076416015625, "learning_rate": 0.0001, "loss": 4.6308, "loss/crossentropy": 1.9054389595985413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24933087825775146, "step": 2574 }, { "epoch": 0.05152, "grad_norm": 2.625, "grad_norm_var": 0.021284993489583334, "learning_rate": 0.0001, "loss": 4.9682, "loss/crossentropy": 2.142069697380066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3162301778793335, "step": 2576 }, { "epoch": 0.05156, "grad_norm": 2.59375, "grad_norm_var": 0.0197662353515625, "learning_rate": 0.0001, "loss": 5.018, "loss/crossentropy": 1.9952309727668762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560836151242256, "step": 2578 }, { "epoch": 0.0516, "grad_norm": 2.765625, "grad_norm_var": 0.016405232747395835, "learning_rate": 0.0001, "loss": 4.8889, "loss/crossentropy": 2.0579317212104797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26214616745710373, "step": 2580 }, { "epoch": 0.05164, "grad_norm": 2.46875, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 4.6306, "loss/crossentropy": 2.076499104499817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26244185864925385, "step": 2582 }, { "epoch": 0.05168, "grad_norm": 2.890625, "grad_norm_var": 0.026146443684895833, "learning_rate": 0.0001, "loss": 4.9393, "loss/crossentropy": 2.2277501821517944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32310059666633606, "step": 2584 }, { "epoch": 0.05172, "grad_norm": 2.53125, "grad_norm_var": 0.03141988118489583, "learning_rate": 0.0001, "loss": 5.0929, "loss/crossentropy": 2.101436138153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26044395565986633, "step": 2586 }, { "epoch": 0.05176, "grad_norm": 2.703125, "grad_norm_var": 0.030939737955729168, "learning_rate": 0.0001, "loss": 5.045, "loss/crossentropy": 2.2617305517196655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2768043726682663, "step": 2588 }, { "epoch": 0.0518, "grad_norm": 2.515625, "grad_norm_var": 0.0349029541015625, "learning_rate": 0.0001, "loss": 4.989, "loss/crossentropy": 2.2669495940208435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2941794842481613, "step": 2590 }, { "epoch": 0.05184, "grad_norm": 2.546875, "grad_norm_var": 0.03871968587239583, "learning_rate": 0.0001, "loss": 4.7676, "loss/crossentropy": 1.8102391958236694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24766983091831207, "step": 2592 }, { "epoch": 0.05188, "grad_norm": 2.53125, "grad_norm_var": 0.04006754557291667, "learning_rate": 0.0001, "loss": 5.0022, "loss/crossentropy": 2.1426219940185547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2808763086795807, "step": 2594 }, { "epoch": 0.05192, "grad_norm": 2.75, "grad_norm_var": 0.0368804931640625, "learning_rate": 0.0001, "loss": 4.8649, "loss/crossentropy": 2.0731321573257446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28768520057201385, "step": 2596 }, { "epoch": 0.05196, "grad_norm": 2.46875, "grad_norm_var": 0.03619384765625, "learning_rate": 0.0001, "loss": 4.8019, "loss/crossentropy": 1.8902159333229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27209727466106415, "step": 2598 }, { "epoch": 0.052, "grad_norm": 2.390625, "grad_norm_var": 0.030855305989583335, "learning_rate": 0.0001, "loss": 4.837, "loss/crossentropy": 2.1860097646713257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3191404938697815, "step": 2600 }, { "epoch": 0.05204, "grad_norm": 2.78125, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 4.8471, "loss/crossentropy": 2.1010658740997314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2924908995628357, "step": 2602 }, { "epoch": 0.05208, "grad_norm": 2.71875, "grad_norm_var": 0.0285064697265625, "learning_rate": 0.0001, "loss": 5.1725, "loss/crossentropy": 2.0668399930000305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2691944092512131, "step": 2604 }, { "epoch": 0.05212, "grad_norm": 2.8125, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 5.0115, "loss/crossentropy": 2.310541272163391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31448885798454285, "step": 2606 }, { "epoch": 0.05216, "grad_norm": 2.75, "grad_norm_var": 0.0222808837890625, "learning_rate": 0.0001, "loss": 4.7731, "loss/crossentropy": 2.023577332496643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28528447449207306, "step": 2608 }, { "epoch": 0.0522, "grad_norm": 2.5625, "grad_norm_var": 0.0217193603515625, "learning_rate": 0.0001, "loss": 4.8408, "loss/crossentropy": 2.0232901573181152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25540125370025635, "step": 2610 }, { "epoch": 0.05224, "grad_norm": 2.5625, "grad_norm_var": 0.019554646809895833, "learning_rate": 0.0001, "loss": 4.8374, "loss/crossentropy": 2.147561550140381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2914447784423828, "step": 2612 }, { "epoch": 0.05228, "grad_norm": 2.46875, "grad_norm_var": 0.0193359375, "learning_rate": 0.0001, "loss": 4.8044, "loss/crossentropy": 1.9136184453964233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27879445254802704, "step": 2614 }, { "epoch": 0.05232, "grad_norm": 3.03125, "grad_norm_var": 0.0606842041015625, "learning_rate": 0.0001, "loss": 5.1245, "loss/crossentropy": 2.1118494272232056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2990800142288208, "step": 2616 }, { "epoch": 0.05236, "grad_norm": 2.640625, "grad_norm_var": 0.0599761962890625, "learning_rate": 0.0001, "loss": 5.2189, "loss/crossentropy": 1.9564325213432312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2805679142475128, "step": 2618 }, { "epoch": 0.0524, "grad_norm": 3.046875, "grad_norm_var": 0.06641337076822916, "learning_rate": 0.0001, "loss": 4.9511, "loss/crossentropy": 2.0683051347732544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2906472980976105, "step": 2620 }, { "epoch": 0.05244, "grad_norm": 2.609375, "grad_norm_var": 0.06653645833333334, "learning_rate": 0.0001, "loss": 5.058, "loss/crossentropy": 2.0510823130607605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28148986399173737, "step": 2622 }, { "epoch": 0.05248, "grad_norm": 2.640625, "grad_norm_var": 0.06256103515625, "learning_rate": 0.0001, "loss": 5.2113, "loss/crossentropy": 2.2972904443740845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32902073860168457, "step": 2624 }, { "epoch": 0.05252, "grad_norm": 2.890625, "grad_norm_var": 0.0582427978515625, "learning_rate": 0.0001, "loss": 5.0939, "loss/crossentropy": 1.9502179026603699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2844041585922241, "step": 2626 }, { "epoch": 0.05256, "grad_norm": 2.640625, "grad_norm_var": 0.05607808430989583, "learning_rate": 0.0001, "loss": 5.1078, "loss/crossentropy": 2.1577298045158386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2673380598425865, "step": 2628 }, { "epoch": 0.0526, "grad_norm": 2.59375, "grad_norm_var": 0.05271708170572917, "learning_rate": 0.0001, "loss": 5.0514, "loss/crossentropy": 2.1707664132118225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2697260081768036, "step": 2630 }, { "epoch": 0.05264, "grad_norm": 2.71875, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 4.9617, "loss/crossentropy": 2.0975311398506165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27033862471580505, "step": 2632 }, { "epoch": 0.05268, "grad_norm": 3.25, "grad_norm_var": 0.033869425455729164, "learning_rate": 0.0001, "loss": 5.1841, "loss/crossentropy": 2.197197914123535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28717951476573944, "step": 2634 }, { "epoch": 0.05272, "grad_norm": 2.765625, "grad_norm_var": 0.03570556640625, "learning_rate": 0.0001, "loss": 5.2558, "loss/crossentropy": 2.2898266315460205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31359314918518066, "step": 2636 }, { "epoch": 0.05276, "grad_norm": 2.796875, "grad_norm_var": 0.0347564697265625, "learning_rate": 0.0001, "loss": 5.4849, "loss/crossentropy": 2.525710701942444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3199215829372406, "step": 2638 }, { "epoch": 0.0528, "grad_norm": 2.796875, "grad_norm_var": 0.03349609375, "learning_rate": 0.0001, "loss": 5.0495, "loss/crossentropy": 2.2799761295318604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28715676069259644, "step": 2640 }, { "epoch": 0.05284, "grad_norm": 2.796875, "grad_norm_var": 0.032373046875, "learning_rate": 0.0001, "loss": 4.9837, "loss/crossentropy": 1.8681190013885498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25435833632946014, "step": 2642 }, { "epoch": 0.05288, "grad_norm": 2.578125, "grad_norm_var": 0.03730061848958333, "learning_rate": 0.0001, "loss": 4.807, "loss/crossentropy": 1.9067540168762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25444281101226807, "step": 2644 }, { "epoch": 0.05292, "grad_norm": 2.53125, "grad_norm_var": 0.03732096354166667, "learning_rate": 0.0001, "loss": 5.0326, "loss/crossentropy": 2.370971202850342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31173495948314667, "step": 2646 }, { "epoch": 0.05296, "grad_norm": 2.671875, "grad_norm_var": 0.03752848307291667, "learning_rate": 0.0001, "loss": 5.1148, "loss/crossentropy": 2.1829749941825867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29970400035381317, "step": 2648 }, { "epoch": 0.053, "grad_norm": 2.609375, "grad_norm_var": 0.020052083333333335, "learning_rate": 0.0001, "loss": 5.1086, "loss/crossentropy": 2.0818498134613037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27712464332580566, "step": 2650 }, { "epoch": 0.05304, "grad_norm": 2.609375, "grad_norm_var": 0.010789998372395833, "learning_rate": 0.0001, "loss": 5.1071, "loss/crossentropy": 2.2080377340316772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27552157640457153, "step": 2652 }, { "epoch": 0.05308, "grad_norm": 2.53125, "grad_norm_var": 0.01109619140625, "learning_rate": 0.0001, "loss": 4.9685, "loss/crossentropy": 1.7115904092788696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23266373574733734, "step": 2654 }, { "epoch": 0.05312, "grad_norm": 2.59375, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.8654, "loss/crossentropy": 2.1736810207366943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29696571826934814, "step": 2656 }, { "epoch": 0.05316, "grad_norm": 2.765625, "grad_norm_var": 0.012886555989583333, "learning_rate": 0.0001, "loss": 5.0834, "loss/crossentropy": 2.2366485595703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3062315583229065, "step": 2658 }, { "epoch": 0.0532, "grad_norm": 2.609375, "grad_norm_var": 0.018748982747395834, "learning_rate": 0.0001, "loss": 5.0888, "loss/crossentropy": 1.9835070371627808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.35663366317749023, "step": 2660 }, { "epoch": 0.05324, "grad_norm": 2.4375, "grad_norm_var": 0.026688639322916666, "learning_rate": 0.0001, "loss": 4.6725, "loss/crossentropy": 2.148723840713501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687191218137741, "step": 2662 }, { "epoch": 0.05328, "grad_norm": 2.625, "grad_norm_var": 0.02789306640625, "learning_rate": 0.0001, "loss": 4.9191, "loss/crossentropy": 2.2642128467559814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2738381028175354, "step": 2664 }, { "epoch": 0.05332, "grad_norm": 2.546875, "grad_norm_var": 0.028483072916666668, "learning_rate": 0.0001, "loss": 4.8209, "loss/crossentropy": 1.839052438735962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2566695362329483, "step": 2666 }, { "epoch": 0.05336, "grad_norm": 2.6875, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 4.8771, "loss/crossentropy": 2.083684980869293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2632910907268524, "step": 2668 }, { "epoch": 0.0534, "grad_norm": 2.75, "grad_norm_var": 0.028434244791666667, "learning_rate": 0.0001, "loss": 4.8585, "loss/crossentropy": 2.2125936150550842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31363190710544586, "step": 2670 }, { "epoch": 0.05344, "grad_norm": 2.734375, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 5.0354, "loss/crossentropy": 2.2075263261795044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2850564122200012, "step": 2672 }, { "epoch": 0.05348, "grad_norm": 2.859375, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 5.0145, "loss/crossentropy": 2.0876463651657104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2972792685031891, "step": 2674 }, { "epoch": 0.05352, "grad_norm": 2.84375, "grad_norm_var": 0.020406087239583332, "learning_rate": 0.0001, "loss": 5.2755, "loss/crossentropy": 2.4033172130584717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.304116889834404, "step": 2676 }, { "epoch": 0.05356, "grad_norm": 2.734375, "grad_norm_var": 0.014501953125, "learning_rate": 0.0001, "loss": 5.0921, "loss/crossentropy": 2.30586314201355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3264722675085068, "step": 2678 }, { "epoch": 0.0536, "grad_norm": 2.46875, "grad_norm_var": 0.015152994791666667, "learning_rate": 0.0001, "loss": 5.0941, "loss/crossentropy": 2.2175174951553345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2661540359258652, "step": 2680 }, { "epoch": 0.05364, "grad_norm": 2.921875, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 5.1162, "loss/crossentropy": 2.0583900213241577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26995618641376495, "step": 2682 }, { "epoch": 0.05368, "grad_norm": 2.59375, "grad_norm_var": 0.020173136393229166, "learning_rate": 0.0001, "loss": 4.9357, "loss/crossentropy": 2.310309052467346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29642508924007416, "step": 2684 }, { "epoch": 0.05372, "grad_norm": 3.15625, "grad_norm_var": 0.03332926432291667, "learning_rate": 0.0001, "loss": 4.7945, "loss/crossentropy": 2.134134352207184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2619224041700363, "step": 2686 }, { "epoch": 0.05376, "grad_norm": 2.609375, "grad_norm_var": 0.03306884765625, "learning_rate": 0.0001, "loss": 4.8411, "loss/crossentropy": 1.930562138557434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27108173072338104, "step": 2688 }, { "epoch": 0.0538, "grad_norm": 2.46875, "grad_norm_var": 0.03455403645833333, "learning_rate": 0.0001, "loss": 4.9591, "loss/crossentropy": 2.3414769172668457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.270420178771019, "step": 2690 }, { "epoch": 0.05384, "grad_norm": 2.78125, "grad_norm_var": 0.0356842041015625, "learning_rate": 0.0001, "loss": 4.83, "loss/crossentropy": 2.1726938486099243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710433751344681, "step": 2692 }, { "epoch": 0.05388, "grad_norm": 2.640625, "grad_norm_var": 0.0361968994140625, "learning_rate": 0.0001, "loss": 5.0163, "loss/crossentropy": 2.220117926597595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3212582617998123, "step": 2694 }, { "epoch": 0.05392, "grad_norm": 2.71875, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 5.1845, "loss/crossentropy": 2.2557668685913086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3096832036972046, "step": 2696 }, { "epoch": 0.05396, "grad_norm": 2.734375, "grad_norm_var": 0.0246246337890625, "learning_rate": 0.0001, "loss": 5.0399, "loss/crossentropy": 1.94975346326828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2530350238084793, "step": 2698 }, { "epoch": 0.054, "grad_norm": 2.6875, "grad_norm_var": 0.024169921875, "learning_rate": 0.0001, "loss": 5.19, "loss/crossentropy": 2.4622775316238403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3135389983654022, "step": 2700 }, { "epoch": 0.05404, "grad_norm": 2.65625, "grad_norm_var": 0.0110748291015625, "learning_rate": 0.0001, "loss": 5.2005, "loss/crossentropy": 2.5367363691329956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30742934346199036, "step": 2702 }, { "epoch": 0.05408, "grad_norm": 2.5, "grad_norm_var": 0.014631144205729167, "learning_rate": 0.0001, "loss": 5.146, "loss/crossentropy": 2.5733184814453125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33410580456256866, "step": 2704 }, { "epoch": 0.05412, "grad_norm": 2.6875, "grad_norm_var": 0.011725870768229167, "learning_rate": 0.0001, "loss": 4.8888, "loss/crossentropy": 1.9339997172355652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28760699927806854, "step": 2706 }, { "epoch": 0.05416, "grad_norm": 2.484375, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.8719, "loss/crossentropy": 1.8515672087669373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23648252338171005, "step": 2708 }, { "epoch": 0.0542, "grad_norm": 2.546875, "grad_norm_var": 0.014420572916666667, "learning_rate": 0.0001, "loss": 4.6598, "loss/crossentropy": 2.0973429083824158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2605738639831543, "step": 2710 }, { "epoch": 0.05424, "grad_norm": 2.90625, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 5.2795, "loss/crossentropy": 2.406570076942444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2992263287305832, "step": 2712 }, { "epoch": 0.05428, "grad_norm": 2.53125, "grad_norm_var": 0.024958292643229168, "learning_rate": 0.0001, "loss": 4.8591, "loss/crossentropy": 2.040315330028534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27726222574710846, "step": 2714 }, { "epoch": 0.05432, "grad_norm": 2.40625, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 4.7879, "loss/crossentropy": 2.250051259994507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2764698565006256, "step": 2716 }, { "epoch": 0.05436, "grad_norm": 2.4375, "grad_norm_var": 0.026985677083333333, "learning_rate": 0.0001, "loss": 4.8813, "loss/crossentropy": 2.25112247467041, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3101722151041031, "step": 2718 }, { "epoch": 0.0544, "grad_norm": 2.421875, "grad_norm_var": 0.028709920247395833, "learning_rate": 0.0001, "loss": 4.7242, "loss/crossentropy": 2.261968731880188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2805032432079315, "step": 2720 }, { "epoch": 0.05444, "grad_norm": 2.59375, "grad_norm_var": 0.030078125, "learning_rate": 0.0001, "loss": 5.0449, "loss/crossentropy": 2.376634955406189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25400323420763016, "step": 2722 }, { "epoch": 0.05448, "grad_norm": 2.515625, "grad_norm_var": 0.03351949055989583, "learning_rate": 0.0001, "loss": 5.2325, "loss/crossentropy": 2.61246657371521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3170415759086609, "step": 2724 }, { "epoch": 0.05452, "grad_norm": 2.59375, "grad_norm_var": 0.0296051025390625, "learning_rate": 0.0001, "loss": 5.0433, "loss/crossentropy": 2.3982752561569214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2730572074651718, "step": 2726 }, { "epoch": 0.05456, "grad_norm": 2.609375, "grad_norm_var": 0.0207916259765625, "learning_rate": 0.0001, "loss": 4.8836, "loss/crossentropy": 1.9890516996383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26453813910484314, "step": 2728 }, { "epoch": 0.0546, "grad_norm": 2.4375, "grad_norm_var": 0.016559855143229166, "learning_rate": 0.0001, "loss": 4.7252, "loss/crossentropy": 2.1825047731399536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28369753062725067, "step": 2730 }, { "epoch": 0.05464, "grad_norm": 2.5, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 4.9445, "loss/crossentropy": 1.9745987057685852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2662041634321213, "step": 2732 }, { "epoch": 0.05468, "grad_norm": 2.640625, "grad_norm_var": 0.020783487955729166, "learning_rate": 0.0001, "loss": 4.7591, "loss/crossentropy": 2.2962852716445923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2986691743135452, "step": 2734 }, { "epoch": 0.05472, "grad_norm": 2.65625, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 4.9712, "loss/crossentropy": 2.0517550110816956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27293455600738525, "step": 2736 }, { "epoch": 0.05476, "grad_norm": 2.859375, "grad_norm_var": 0.021434529622395834, "learning_rate": 0.0001, "loss": 5.4052, "loss/crossentropy": 2.327734112739563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3019224554300308, "step": 2738 }, { "epoch": 0.0548, "grad_norm": 2.5, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 4.7002, "loss/crossentropy": 2.236689567565918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2816064953804016, "step": 2740 }, { "epoch": 0.05484, "grad_norm": 2.75, "grad_norm_var": 0.020894368489583332, "learning_rate": 0.0001, "loss": 5.1321, "loss/crossentropy": 2.0209690928459167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2734442874789238, "step": 2742 }, { "epoch": 0.05488, "grad_norm": 2.640625, "grad_norm_var": 0.026276652018229166, "learning_rate": 0.0001, "loss": 4.9841, "loss/crossentropy": 2.2264864444732666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28523482382297516, "step": 2744 }, { "epoch": 0.05492, "grad_norm": 2.421875, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.8661, "loss/crossentropy": 1.9897980690002441, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26623376458883286, "step": 2746 }, { "epoch": 0.05496, "grad_norm": 2.53125, "grad_norm_var": 0.024442545572916665, "learning_rate": 0.0001, "loss": 4.9299, "loss/crossentropy": 2.0583779215812683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2803298681974411, "step": 2748 }, { "epoch": 0.055, "grad_norm": 2.515625, "grad_norm_var": 0.022001139322916665, "learning_rate": 0.0001, "loss": 4.9818, "loss/crossentropy": 1.8448269367218018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2606130689382553, "step": 2750 }, { "epoch": 0.05504, "grad_norm": 2.640625, "grad_norm_var": 0.02301025390625, "learning_rate": 0.0001, "loss": 5.0885, "loss/crossentropy": 2.294836401939392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29403699934482574, "step": 2752 }, { "epoch": 0.05508, "grad_norm": 2.296875, "grad_norm_var": 0.02808837890625, "learning_rate": 0.0001, "loss": 4.4637, "loss/crossentropy": 2.2402058839797974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28176650404930115, "step": 2754 }, { "epoch": 0.05512, "grad_norm": 2.734375, "grad_norm_var": 0.0298980712890625, "learning_rate": 0.0001, "loss": 4.7805, "loss/crossentropy": 1.7882421612739563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25275079905986786, "step": 2756 }, { "epoch": 0.05516, "grad_norm": 2.8125, "grad_norm_var": 0.02919921875, "learning_rate": 0.0001, "loss": 5.1277, "loss/crossentropy": 2.4185458421707153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30663780868053436, "step": 2758 }, { "epoch": 0.0552, "grad_norm": 2.484375, "grad_norm_var": 0.0318511962890625, "learning_rate": 0.0001, "loss": 4.9259, "loss/crossentropy": 2.2588642835617065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26785464584827423, "step": 2760 }, { "epoch": 0.05524, "grad_norm": 2.484375, "grad_norm_var": 0.034012858072916666, "learning_rate": 0.0001, "loss": 4.8359, "loss/crossentropy": 2.145754337310791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710302472114563, "step": 2762 }, { "epoch": 0.05528, "grad_norm": 2.609375, "grad_norm_var": 0.03648681640625, "learning_rate": 0.0001, "loss": 4.739, "loss/crossentropy": 2.3627569675445557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2926081120967865, "step": 2764 }, { "epoch": 0.05532, "grad_norm": 2.609375, "grad_norm_var": 0.03632405598958333, "learning_rate": 0.0001, "loss": 4.8512, "loss/crossentropy": 1.988103210926056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24308288842439651, "step": 2766 }, { "epoch": 0.05536, "grad_norm": 2.828125, "grad_norm_var": 0.040445963541666664, "learning_rate": 0.0001, "loss": 5.1075, "loss/crossentropy": 2.2497689723968506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.313574954867363, "step": 2768 }, { "epoch": 0.0554, "grad_norm": 2.609375, "grad_norm_var": 0.030492146809895832, "learning_rate": 0.0001, "loss": 4.8978, "loss/crossentropy": 2.2603683471679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27997657656669617, "step": 2770 }, { "epoch": 0.05544, "grad_norm": 2.796875, "grad_norm_var": 0.030614217122395832, "learning_rate": 0.0001, "loss": 4.9485, "loss/crossentropy": 2.2585065364837646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2818940281867981, "step": 2772 }, { "epoch": 0.05548, "grad_norm": 2.578125, "grad_norm_var": 0.027730305989583332, "learning_rate": 0.0001, "loss": 5.2222, "loss/crossentropy": 2.1413429975509644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2950669527053833, "step": 2774 }, { "epoch": 0.05552, "grad_norm": 2.734375, "grad_norm_var": 0.0171875, "learning_rate": 0.0001, "loss": 5.1567, "loss/crossentropy": 1.9994583129882812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3946940451860428, "step": 2776 }, { "epoch": 0.05556, "grad_norm": 2.796875, "grad_norm_var": 0.014762369791666667, "learning_rate": 0.0001, "loss": 5.2877, "loss/crossentropy": 2.423824667930603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30817069113254547, "step": 2778 }, { "epoch": 0.0556, "grad_norm": 2.4375, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 4.6657, "loss/crossentropy": 1.8579126000404358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24312064796686172, "step": 2780 }, { "epoch": 0.05564, "grad_norm": 2.390625, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 4.7653, "loss/crossentropy": 2.3444939851760864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28466545045375824, "step": 2782 }, { "epoch": 0.05568, "grad_norm": 2.71875, "grad_norm_var": 0.03806864420572917, "learning_rate": 0.0001, "loss": 5.1187, "loss/crossentropy": 2.221144914627075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2694521099328995, "step": 2784 }, { "epoch": 0.05572, "grad_norm": 2.578125, "grad_norm_var": 0.0376861572265625, "learning_rate": 0.0001, "loss": 5.0401, "loss/crossentropy": 1.9372909665107727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2698594778776169, "step": 2786 }, { "epoch": 0.05576, "grad_norm": 2.765625, "grad_norm_var": 0.038834635416666666, "learning_rate": 0.0001, "loss": 4.834, "loss/crossentropy": 2.129204750061035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27333614230155945, "step": 2788 }, { "epoch": 0.0558, "grad_norm": 2.5, "grad_norm_var": 0.04045817057291667, "learning_rate": 0.0001, "loss": 4.8462, "loss/crossentropy": 1.6917370557785034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24290545284748077, "step": 2790 }, { "epoch": 0.05584, "grad_norm": 2.765625, "grad_norm_var": 0.04096577962239583, "learning_rate": 0.0001, "loss": 4.6942, "loss/crossentropy": 1.7883749604225159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2554834187030792, "step": 2792 }, { "epoch": 0.05588, "grad_norm": 2.765625, "grad_norm_var": 0.04560445149739583, "learning_rate": 0.0001, "loss": 4.6835, "loss/crossentropy": 1.867136001586914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667583078145981, "step": 2794 }, { "epoch": 0.05592, "grad_norm": 2.546875, "grad_norm_var": 0.04143473307291667, "learning_rate": 0.0001, "loss": 4.9686, "loss/crossentropy": 2.032800853252411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2580890506505966, "step": 2796 }, { "epoch": 0.05596, "grad_norm": 3.03125, "grad_norm_var": 0.04389546712239583, "learning_rate": 0.0001, "loss": 5.018, "loss/crossentropy": 1.9867863655090332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2694792151451111, "step": 2798 }, { "epoch": 0.056, "grad_norm": 2.59375, "grad_norm_var": 0.026786295572916667, "learning_rate": 0.0001, "loss": 4.9137, "loss/crossentropy": 2.1026757955551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2664051130414009, "step": 2800 }, { "epoch": 0.05604, "grad_norm": 2.65625, "grad_norm_var": 0.0259918212890625, "learning_rate": 0.0001, "loss": 4.9883, "loss/crossentropy": 2.0649160742759705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2473129704594612, "step": 2802 }, { "epoch": 0.05608, "grad_norm": 2.921875, "grad_norm_var": 0.030451456705729168, "learning_rate": 0.0001, "loss": 4.8959, "loss/crossentropy": 2.2110248804092407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27131715416908264, "step": 2804 }, { "epoch": 0.05612, "grad_norm": 2.65625, "grad_norm_var": 0.032933553059895836, "learning_rate": 0.0001, "loss": 4.9603, "loss/crossentropy": 2.3483060598373413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27071496844291687, "step": 2806 }, { "epoch": 0.05616, "grad_norm": 2.640625, "grad_norm_var": 0.03277587890625, "learning_rate": 0.0001, "loss": 4.9273, "loss/crossentropy": 1.9061944484710693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550426125526428, "step": 2808 }, { "epoch": 0.0562, "grad_norm": 2.671875, "grad_norm_var": 0.0278472900390625, "learning_rate": 0.0001, "loss": 5.1734, "loss/crossentropy": 2.3073103427886963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30051568150520325, "step": 2810 }, { "epoch": 0.05624, "grad_norm": 2.671875, "grad_norm_var": 0.027106730143229167, "learning_rate": 0.0001, "loss": 5.0268, "loss/crossentropy": 2.393891453742981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29226459562778473, "step": 2812 }, { "epoch": 0.05628, "grad_norm": 2.484375, "grad_norm_var": 0.018578084309895833, "learning_rate": 0.0001, "loss": 5.0839, "loss/crossentropy": 2.3082761764526367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29539754986763, "step": 2814 }, { "epoch": 0.05632, "grad_norm": 2.515625, "grad_norm_var": 0.021776326497395835, "learning_rate": 0.0001, "loss": 4.8973, "loss/crossentropy": 2.7815494537353516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33258646726608276, "step": 2816 }, { "epoch": 0.05636, "grad_norm": 2.390625, "grad_norm_var": 0.031538899739583334, "learning_rate": 0.0001, "loss": 4.7095, "loss/crossentropy": 2.077984571456909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.261022225022316, "step": 2818 }, { "epoch": 0.0564, "grad_norm": 2.484375, "grad_norm_var": 0.0251617431640625, "learning_rate": 0.0001, "loss": 4.6389, "loss/crossentropy": 2.0524688363075256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2936979830265045, "step": 2820 }, { "epoch": 0.05644, "grad_norm": 2.5, "grad_norm_var": 0.018212890625, "learning_rate": 0.0001, "loss": 4.9657, "loss/crossentropy": 1.8323140740394592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26426824927330017, "step": 2822 }, { "epoch": 0.05648, "grad_norm": 2.765625, "grad_norm_var": 0.0201171875, "learning_rate": 0.0001, "loss": 4.9513, "loss/crossentropy": 2.2290462255477905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28852027654647827, "step": 2824 }, { "epoch": 0.05652, "grad_norm": 2.5625, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 4.7828, "loss/crossentropy": 1.8788678050041199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24312467128038406, "step": 2826 }, { "epoch": 0.05656, "grad_norm": 2.46875, "grad_norm_var": 0.01646728515625, "learning_rate": 0.0001, "loss": 4.7882, "loss/crossentropy": 1.9402090311050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25435876101255417, "step": 2828 }, { "epoch": 0.0566, "grad_norm": 2.390625, "grad_norm_var": 0.0144683837890625, "learning_rate": 0.0001, "loss": 4.5823, "loss/crossentropy": 2.4833847284317017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27234241366386414, "step": 2830 }, { "epoch": 0.05664, "grad_norm": 2.515625, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 5.0521, "loss/crossentropy": 2.351561665534973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28028184175491333, "step": 2832 }, { "epoch": 0.05668, "grad_norm": 2.953125, "grad_norm_var": 0.019722493489583333, "learning_rate": 0.0001, "loss": 5.0688, "loss/crossentropy": 2.0372352600097656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26005299389362335, "step": 2834 }, { "epoch": 0.05672, "grad_norm": 2.828125, "grad_norm_var": 0.021744791666666666, "learning_rate": 0.0001, "loss": 4.4282, "loss/crossentropy": 1.6522082090377808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26683834940195084, "step": 2836 }, { "epoch": 0.05676, "grad_norm": 2.390625, "grad_norm_var": 0.023778279622395832, "learning_rate": 0.0001, "loss": 4.7165, "loss/crossentropy": 1.8886643052101135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23464814573526382, "step": 2838 }, { "epoch": 0.0568, "grad_norm": 2.734375, "grad_norm_var": 0.023119099934895835, "learning_rate": 0.0001, "loss": 4.9477, "loss/crossentropy": 2.2161459922790527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29381541907787323, "step": 2840 }, { "epoch": 0.05684, "grad_norm": 2.421875, "grad_norm_var": 0.024779256184895834, "learning_rate": 0.0001, "loss": 4.7408, "loss/crossentropy": 2.1542043685913086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2753119468688965, "step": 2842 }, { "epoch": 0.05688, "grad_norm": 2.78125, "grad_norm_var": 0.026090494791666665, "learning_rate": 0.0001, "loss": 5.1288, "loss/crossentropy": 2.5605628490448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.329460546374321, "step": 2844 }, { "epoch": 0.05692, "grad_norm": 2.609375, "grad_norm_var": 0.023224894205729166, "learning_rate": 0.0001, "loss": 4.9955, "loss/crossentropy": 2.1000319719314575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.273986279964447, "step": 2846 }, { "epoch": 0.05696, "grad_norm": 2.484375, "grad_norm_var": 0.023558553059895834, "learning_rate": 0.0001, "loss": 4.6813, "loss/crossentropy": 2.1036760210990906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26293954253196716, "step": 2848 }, { "epoch": 0.057, "grad_norm": 2.640625, "grad_norm_var": 0.014286295572916666, "learning_rate": 0.0001, "loss": 4.9799, "loss/crossentropy": 2.2130608558654785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2977828085422516, "step": 2850 }, { "epoch": 0.05704, "grad_norm": 2.578125, "grad_norm_var": 0.01461181640625, "learning_rate": 0.0001, "loss": 5.1761, "loss/crossentropy": 2.1878823041915894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27322643995285034, "step": 2852 }, { "epoch": 0.05708, "grad_norm": 2.703125, "grad_norm_var": 0.012547810872395834, "learning_rate": 0.0001, "loss": 4.9448, "loss/crossentropy": 2.1559258699417114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32625503838062286, "step": 2854 }, { "epoch": 0.05712, "grad_norm": 2.390625, "grad_norm_var": 0.015848795572916668, "learning_rate": 0.0001, "loss": 4.4611, "loss/crossentropy": 2.0818992257118225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2677721679210663, "step": 2856 }, { "epoch": 0.05716, "grad_norm": 2.578125, "grad_norm_var": 0.016243489583333333, "learning_rate": 0.0001, "loss": 5.2755, "loss/crossentropy": 2.544907331466675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30028045177459717, "step": 2858 }, { "epoch": 0.0572, "grad_norm": 3.203125, "grad_norm_var": 0.04491780598958333, "learning_rate": 0.0001, "loss": 5.2855, "loss/crossentropy": 2.5932188034057617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30431173741817474, "step": 2860 }, { "epoch": 0.05724, "grad_norm": 2.453125, "grad_norm_var": 0.046219889322916666, "learning_rate": 0.0001, "loss": 4.9799, "loss/crossentropy": 2.1007986068725586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2989690601825714, "step": 2862 }, { "epoch": 0.05728, "grad_norm": 2.5625, "grad_norm_var": 0.044169108072916664, "learning_rate": 0.0001, "loss": 4.7706, "loss/crossentropy": 2.0102875232696533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26936179399490356, "step": 2864 }, { "epoch": 0.05732, "grad_norm": 3.109375, "grad_norm_var": 0.056559244791666664, "learning_rate": 0.0001, "loss": 5.2099, "loss/crossentropy": 2.3457159996032715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29194609820842743, "step": 2866 }, { "epoch": 0.05736, "grad_norm": 2.625, "grad_norm_var": 0.055582682291666664, "learning_rate": 0.0001, "loss": 4.8768, "loss/crossentropy": 2.559054732322693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3018783777952194, "step": 2868 }, { "epoch": 0.0574, "grad_norm": 2.703125, "grad_norm_var": 0.05831705729166667, "learning_rate": 0.0001, "loss": 4.9863, "loss/crossentropy": 2.0641059279441833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2789234220981598, "step": 2870 }, { "epoch": 0.05744, "grad_norm": 2.734375, "grad_norm_var": 0.04683837890625, "learning_rate": 0.0001, "loss": 4.9222, "loss/crossentropy": 2.0819749236106873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26825186610221863, "step": 2872 }, { "epoch": 0.05748, "grad_norm": 2.703125, "grad_norm_var": 0.046263631184895834, "learning_rate": 0.0001, "loss": 4.9586, "loss/crossentropy": 2.1114020347595215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613547742366791, "step": 2874 }, { "epoch": 0.05752, "grad_norm": 2.4375, "grad_norm_var": 0.02916259765625, "learning_rate": 0.0001, "loss": 4.9474, "loss/crossentropy": 2.174915075302124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29503974318504333, "step": 2876 }, { "epoch": 0.05756, "grad_norm": 2.6875, "grad_norm_var": 0.0265289306640625, "learning_rate": 0.0001, "loss": 5.0541, "loss/crossentropy": 2.4342113733291626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30623678863048553, "step": 2878 }, { "epoch": 0.0576, "grad_norm": 2.703125, "grad_norm_var": 0.024918619791666666, "learning_rate": 0.0001, "loss": 5.0316, "loss/crossentropy": 2.0518307089805603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28422991931438446, "step": 2880 }, { "epoch": 0.05764, "grad_norm": 2.6875, "grad_norm_var": 0.011921183268229166, "learning_rate": 0.0001, "loss": 5.0806, "loss/crossentropy": 2.5378612279891968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29730531573295593, "step": 2882 }, { "epoch": 0.05768, "grad_norm": 2.5, "grad_norm_var": 0.014972941080729166, "learning_rate": 0.0001, "loss": 4.8685, "loss/crossentropy": 2.2670027017593384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26347288489341736, "step": 2884 }, { "epoch": 0.05772, "grad_norm": 3.671875, "grad_norm_var": 0.07822265625, "learning_rate": 0.0001, "loss": 5.0385, "loss/crossentropy": 2.351823568344116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3020637035369873, "step": 2886 }, { "epoch": 0.05776, "grad_norm": 2.65625, "grad_norm_var": 0.0792633056640625, "learning_rate": 0.0001, "loss": 4.9699, "loss/crossentropy": 2.190839111804962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2639586254954338, "step": 2888 }, { "epoch": 0.0578, "grad_norm": 2.3125, "grad_norm_var": 0.08765869140625, "learning_rate": 0.0001, "loss": 4.6882, "loss/crossentropy": 2.148400902748108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27762140333652496, "step": 2890 }, { "epoch": 0.05784, "grad_norm": 2.984375, "grad_norm_var": 0.09696858723958333, "learning_rate": 0.0001, "loss": 5.1033, "loss/crossentropy": 2.01130074262619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2678648605942726, "step": 2892 }, { "epoch": 0.05788, "grad_norm": 2.75, "grad_norm_var": 0.09986572265625, "learning_rate": 0.0001, "loss": 5.0829, "loss/crossentropy": 2.2239269018173218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2971164286136627, "step": 2894 }, { "epoch": 0.05792, "grad_norm": 2.328125, "grad_norm_var": 0.10974934895833334, "learning_rate": 0.0001, "loss": 4.48, "loss/crossentropy": 1.9573850631713867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.241651751101017, "step": 2896 }, { "epoch": 0.05796, "grad_norm": 2.5, "grad_norm_var": 0.11169331868489583, "learning_rate": 0.0001, "loss": 4.8806, "loss/crossentropy": 1.9522782564163208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2685594707727432, "step": 2898 }, { "epoch": 0.058, "grad_norm": 2.671875, "grad_norm_var": 0.10886128743489583, "learning_rate": 0.0001, "loss": 4.9335, "loss/crossentropy": 2.1742069721221924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27867090702056885, "step": 2900 }, { "epoch": 0.05804, "grad_norm": 5.15625, "grad_norm_var": 0.44882405598958336, "learning_rate": 0.0001, "loss": 4.9164, "loss/crossentropy": 2.201782703399658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2731190174818039, "step": 2902 }, { "epoch": 0.05808, "grad_norm": 2.734375, "grad_norm_var": 0.4452301025390625, "learning_rate": 0.0001, "loss": 4.9787, "loss/crossentropy": 2.159709095954895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2940017879009247, "step": 2904 }, { "epoch": 0.05812, "grad_norm": 2.71875, "grad_norm_var": 0.4297271728515625, "learning_rate": 0.0001, "loss": 4.9192, "loss/crossentropy": 2.1989885568618774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2776087671518326, "step": 2906 }, { "epoch": 0.05816, "grad_norm": 2.546875, "grad_norm_var": 0.42688395182291666, "learning_rate": 0.0001, "loss": 4.8851, "loss/crossentropy": 1.9634575247764587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27917809784412384, "step": 2908 }, { "epoch": 0.0582, "grad_norm": 2.390625, "grad_norm_var": 0.43651936848958334, "learning_rate": 0.0001, "loss": 4.7776, "loss/crossentropy": 2.1553479433059692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28876082599163055, "step": 2910 }, { "epoch": 0.05824, "grad_norm": 2.515625, "grad_norm_var": 0.42451883951822916, "learning_rate": 0.0001, "loss": 4.881, "loss/crossentropy": 2.1035598516464233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2851293236017227, "step": 2912 }, { "epoch": 0.05828, "grad_norm": 2.890625, "grad_norm_var": 0.42097880045572916, "learning_rate": 0.0001, "loss": 5.2733, "loss/crossentropy": 2.17076575756073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28258734941482544, "step": 2914 }, { "epoch": 0.05832, "grad_norm": 2.484375, "grad_norm_var": 0.42477213541666664, "learning_rate": 0.0001, "loss": 4.9147, "loss/crossentropy": 2.2914711236953735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29451698064804077, "step": 2916 }, { "epoch": 0.05836, "grad_norm": 2.703125, "grad_norm_var": 0.021410115559895835, "learning_rate": 0.0001, "loss": 5.1395, "loss/crossentropy": 2.554638981819153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30992935597896576, "step": 2918 }, { "epoch": 0.0584, "grad_norm": 2.546875, "grad_norm_var": 0.05735575358072917, "learning_rate": 0.0001, "loss": 4.6262, "loss/crossentropy": 1.808376431465149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22574126720428467, "step": 2920 }, { "epoch": 0.05844, "grad_norm": 2.78125, "grad_norm_var": 0.05660400390625, "learning_rate": 0.0001, "loss": 5.1636, "loss/crossentropy": 2.1471784114837646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27325738966464996, "step": 2922 }, { "epoch": 0.05848, "grad_norm": 2.765625, "grad_norm_var": 0.054182942708333334, "learning_rate": 0.0001, "loss": 5.071, "loss/crossentropy": 2.2175731658935547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3013547956943512, "step": 2924 }, { "epoch": 0.05852, "grad_norm": 2.953125, "grad_norm_var": 0.04949544270833333, "learning_rate": 0.0001, "loss": 5.467, "loss/crossentropy": 2.369373917579651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29249751567840576, "step": 2926 }, { "epoch": 0.05856, "grad_norm": 2.453125, "grad_norm_var": 0.054915364583333334, "learning_rate": 0.0001, "loss": 4.8778, "loss/crossentropy": 2.1758522987365723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28846102952957153, "step": 2928 }, { "epoch": 0.0586, "grad_norm": 2.546875, "grad_norm_var": 0.05413411458333333, "learning_rate": 0.0001, "loss": 4.929, "loss/crossentropy": 2.46218478679657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2824682295322418, "step": 2930 }, { "epoch": 0.05864, "grad_norm": 2.40625, "grad_norm_var": 0.0606597900390625, "learning_rate": 0.0001, "loss": 4.5557, "loss/crossentropy": 2.058937907218933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26767465472221375, "step": 2932 }, { "epoch": 0.05868, "grad_norm": 2.546875, "grad_norm_var": 0.06109619140625, "learning_rate": 0.0001, "loss": 4.9103, "loss/crossentropy": 2.312442421913147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2952658236026764, "step": 2934 }, { "epoch": 0.05872, "grad_norm": 2.71875, "grad_norm_var": 0.026911417643229168, "learning_rate": 0.0001, "loss": 5.1452, "loss/crossentropy": 2.18839955329895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31788623332977295, "step": 2936 }, { "epoch": 0.05876, "grad_norm": 2.671875, "grad_norm_var": 0.025537109375, "learning_rate": 0.0001, "loss": 5.1358, "loss/crossentropy": 2.1330811977386475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28829698264598846, "step": 2938 }, { "epoch": 0.0588, "grad_norm": 2.9375, "grad_norm_var": 0.0294586181640625, "learning_rate": 0.0001, "loss": 5.1071, "loss/crossentropy": 2.124038338661194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2708826810121536, "step": 2940 }, { "epoch": 0.05884, "grad_norm": 2.53125, "grad_norm_var": 0.024312337239583332, "learning_rate": 0.0001, "loss": 4.8479, "loss/crossentropy": 2.1934449076652527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27314090728759766, "step": 2942 }, { "epoch": 0.05888, "grad_norm": 2.6875, "grad_norm_var": 0.024535115559895834, "learning_rate": 0.0001, "loss": 4.9904, "loss/crossentropy": 1.967636525630951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23761005699634552, "step": 2944 }, { "epoch": 0.05892, "grad_norm": 3.015625, "grad_norm_var": 0.034821573893229166, "learning_rate": 0.0001, "loss": 4.9214, "loss/crossentropy": 2.2380826473236084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3402934819459915, "step": 2946 }, { "epoch": 0.05896, "grad_norm": 2.5625, "grad_norm_var": 0.0299468994140625, "learning_rate": 0.0001, "loss": 4.7702, "loss/crossentropy": 2.227039933204651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27934837341308594, "step": 2948 }, { "epoch": 0.059, "grad_norm": 2.6875, "grad_norm_var": 0.03258056640625, "learning_rate": 0.0001, "loss": 4.789, "loss/crossentropy": 1.999170958995819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.275806725025177, "step": 2950 }, { "epoch": 0.05904, "grad_norm": 2.546875, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 4.5754, "loss/crossentropy": 1.8843520879745483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26203446090221405, "step": 2952 }, { "epoch": 0.05908, "grad_norm": 2.5, "grad_norm_var": 0.033426920572916664, "learning_rate": 0.0001, "loss": 4.5485, "loss/crossentropy": 2.0742241740226746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28777699172496796, "step": 2954 }, { "epoch": 0.05912, "grad_norm": 2.6875, "grad_norm_var": 0.028206380208333333, "learning_rate": 0.0001, "loss": 4.9676, "loss/crossentropy": 2.2272751331329346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27761510014533997, "step": 2956 }, { "epoch": 0.05916, "grad_norm": 2.734375, "grad_norm_var": 0.029878743489583335, "learning_rate": 0.0001, "loss": 4.6329, "loss/crossentropy": 2.2758638858795166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28144824504852295, "step": 2958 }, { "epoch": 0.0592, "grad_norm": 2.46875, "grad_norm_var": 0.060530598958333334, "learning_rate": 0.0001, "loss": 4.9543, "loss/crossentropy": 2.245158016681671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3030128926038742, "step": 2960 }, { "epoch": 0.05924, "grad_norm": 2.453125, "grad_norm_var": 0.053498331705729166, "learning_rate": 0.0001, "loss": 4.567, "loss/crossentropy": 2.3337708711624146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29228493571281433, "step": 2962 }, { "epoch": 0.05928, "grad_norm": 2.5, "grad_norm_var": 0.05788472493489583, "learning_rate": 0.0001, "loss": 4.6477, "loss/crossentropy": 2.3765406608581543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2715871036052704, "step": 2964 }, { "epoch": 0.05932, "grad_norm": 2.375, "grad_norm_var": 0.0562652587890625, "learning_rate": 0.0001, "loss": 4.8649, "loss/crossentropy": 2.090362787246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505037933588028, "step": 2966 }, { "epoch": 0.05936, "grad_norm": 2.453125, "grad_norm_var": 0.056396484375, "learning_rate": 0.0001, "loss": 4.8669, "loss/crossentropy": 2.011539399623871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27219071984291077, "step": 2968 }, { "epoch": 0.0594, "grad_norm": 2.734375, "grad_norm_var": 0.05693359375, "learning_rate": 0.0001, "loss": 5.2081, "loss/crossentropy": 2.1791563034057617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2796429842710495, "step": 2970 }, { "epoch": 0.05944, "grad_norm": 2.65625, "grad_norm_var": 0.05869852701822917, "learning_rate": 0.0001, "loss": 5.2689, "loss/crossentropy": 2.4645297527313232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3106658458709717, "step": 2972 }, { "epoch": 0.05948, "grad_norm": 7.0, "grad_norm_var": 1.25670166015625, "learning_rate": 0.0001, "loss": 5.0715, "loss/crossentropy": 2.2050880193710327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26860976219177246, "step": 2974 }, { "epoch": 0.05952, "grad_norm": 2.828125, "grad_norm_var": 1.236034138997396, "learning_rate": 0.0001, "loss": 4.5815, "loss/crossentropy": 2.0141921639442444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27980539202690125, "step": 2976 }, { "epoch": 0.05956, "grad_norm": 4.4375, "grad_norm_var": 1.3665924072265625, "learning_rate": 0.0001, "loss": 4.8067, "loss/crossentropy": 1.9399088025093079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26379524916410446, "step": 2978 }, { "epoch": 0.0596, "grad_norm": 2.796875, "grad_norm_var": 1.3243886311848958, "learning_rate": 0.0001, "loss": 5.1743, "loss/crossentropy": 2.5415157079696655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.291620597243309, "step": 2980 }, { "epoch": 0.05964, "grad_norm": 2.5625, "grad_norm_var": 1.3170562744140626, "learning_rate": 0.0001, "loss": 5.0695, "loss/crossentropy": 2.1215542554855347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26189403235912323, "step": 2982 }, { "epoch": 0.05968, "grad_norm": 2.578125, "grad_norm_var": 1.32301025390625, "learning_rate": 0.0001, "loss": 5.0049, "loss/crossentropy": 1.7749422788619995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27546317875385284, "step": 2984 }, { "epoch": 0.05972, "grad_norm": 2.59375, "grad_norm_var": 1.3371490478515624, "learning_rate": 0.0001, "loss": 4.8331, "loss/crossentropy": 2.3383208513259888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2790983319282532, "step": 2986 }, { "epoch": 0.05976, "grad_norm": 2.5625, "grad_norm_var": 1.35230712890625, "learning_rate": 0.0001, "loss": 4.8656, "loss/crossentropy": 2.3688780069351196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752939611673355, "step": 2988 }, { "epoch": 0.0598, "grad_norm": 2.6875, "grad_norm_var": 0.22967020670572916, "learning_rate": 0.0001, "loss": 4.7299, "loss/crossentropy": 2.261468529701233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2725592106580734, "step": 2990 }, { "epoch": 0.05984, "grad_norm": 2.671875, "grad_norm_var": 0.2226226806640625, "learning_rate": 0.0001, "loss": 4.7417, "loss/crossentropy": 2.0632028579711914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2617062032222748, "step": 2992 }, { "epoch": 0.05988, "grad_norm": 2.25, "grad_norm_var": 0.022151692708333334, "learning_rate": 0.0001, "loss": 4.4282, "loss/crossentropy": 2.1409813165664673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2657589614391327, "step": 2994 }, { "epoch": 0.05992, "grad_norm": 2.921875, "grad_norm_var": 0.0258941650390625, "learning_rate": 0.0001, "loss": 5.1816, "loss/crossentropy": 2.431404948234558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3087555915117264, "step": 2996 }, { "epoch": 0.05996, "grad_norm": 2.390625, "grad_norm_var": 0.029743448893229166, "learning_rate": 0.0001, "loss": 4.8253, "loss/crossentropy": 2.165238618850708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29382775723934174, "step": 2998 }, { "epoch": 0.06, "grad_norm": 2.546875, "grad_norm_var": 0.034566243489583336, "learning_rate": 0.0001, "loss": 4.6328, "loss/crossentropy": 1.9987847208976746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24466252326965332, "step": 3000 }, { "epoch": 0.06004, "grad_norm": 2.203125, "grad_norm_var": 0.04084370930989583, "learning_rate": 0.0001, "loss": 4.5474, "loss/crossentropy": 2.1153565049171448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2435970976948738, "step": 3002 }, { "epoch": 0.06008, "grad_norm": 2.3125, "grad_norm_var": 0.046402994791666666, "learning_rate": 0.0001, "loss": 4.7765, "loss/crossentropy": 1.8660866618156433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24413185566663742, "step": 3004 }, { "epoch": 0.06012, "grad_norm": 2.40625, "grad_norm_var": 0.04309794108072917, "learning_rate": 0.0001, "loss": 4.7476, "loss/crossentropy": 2.1816678047180176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28010787069797516, "step": 3006 }, { "epoch": 0.06016, "grad_norm": 2.53125, "grad_norm_var": 0.040827433268229164, "learning_rate": 0.0001, "loss": 4.5995, "loss/crossentropy": 2.1673622131347656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2659531533718109, "step": 3008 }, { "epoch": 0.0602, "grad_norm": 2.78125, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 5.2466, "loss/crossentropy": 2.277818202972412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2929365783929825, "step": 3010 }, { "epoch": 0.06024, "grad_norm": 2.65625, "grad_norm_var": 0.0284576416015625, "learning_rate": 0.0001, "loss": 4.8904, "loss/crossentropy": 2.294926404953003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2753874659538269, "step": 3012 }, { "epoch": 0.06028, "grad_norm": 2.359375, "grad_norm_var": 0.028173828125, "learning_rate": 0.0001, "loss": 4.735, "loss/crossentropy": 2.0222257375717163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27295801043510437, "step": 3014 }, { "epoch": 0.06032, "grad_norm": 2.484375, "grad_norm_var": 0.023628743489583333, "learning_rate": 0.0001, "loss": 4.818, "loss/crossentropy": 2.43736469745636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2934436649084091, "step": 3016 }, { "epoch": 0.06036, "grad_norm": 2.703125, "grad_norm_var": 0.020426432291666668, "learning_rate": 0.0001, "loss": 5.0053, "loss/crossentropy": 2.027767241001129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26353244483470917, "step": 3018 }, { "epoch": 0.0604, "grad_norm": 2.59375, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 4.9059, "loss/crossentropy": 2.150290071964264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27290941774845123, "step": 3020 }, { "epoch": 0.06044, "grad_norm": 2.421875, "grad_norm_var": 0.014697265625, "learning_rate": 0.0001, "loss": 4.8307, "loss/crossentropy": 1.8709848523139954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25272610783576965, "step": 3022 }, { "epoch": 0.06048, "grad_norm": 2.46875, "grad_norm_var": 0.015949503580729166, "learning_rate": 0.0001, "loss": 5.0166, "loss/crossentropy": 2.2170883417129517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29401010274887085, "step": 3024 }, { "epoch": 0.06052, "grad_norm": 2.53125, "grad_norm_var": 0.0109375, "learning_rate": 0.0001, "loss": 4.6221, "loss/crossentropy": 1.728318691253662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24246910959482193, "step": 3026 }, { "epoch": 0.06056, "grad_norm": 2.34375, "grad_norm_var": 0.0095123291015625, "learning_rate": 0.0001, "loss": 4.5597, "loss/crossentropy": 1.8453290462493896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24357211589813232, "step": 3028 }, { "epoch": 0.0606, "grad_norm": 2.640625, "grad_norm_var": 0.010456339518229166, "learning_rate": 0.0001, "loss": 4.6617, "loss/crossentropy": 2.273196220397949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29239149391651154, "step": 3030 }, { "epoch": 0.06064, "grad_norm": 2.484375, "grad_norm_var": 0.012165323893229166, "learning_rate": 0.0001, "loss": 4.9464, "loss/crossentropy": 2.031871259212494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2604901194572449, "step": 3032 }, { "epoch": 0.06068, "grad_norm": 2.515625, "grad_norm_var": 0.008915201822916666, "learning_rate": 0.0001, "loss": 5.1618, "loss/crossentropy": 2.1781771183013916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2853371948003769, "step": 3034 }, { "epoch": 0.06072, "grad_norm": 2.484375, "grad_norm_var": 0.006590779622395833, "learning_rate": 0.0001, "loss": 4.816, "loss/crossentropy": 1.9578353762626648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23352904617786407, "step": 3036 }, { "epoch": 0.06076, "grad_norm": 2.671875, "grad_norm_var": 0.012516276041666666, "learning_rate": 0.0001, "loss": 5.3012, "loss/crossentropy": 2.470985770225525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28890977799892426, "step": 3038 }, { "epoch": 0.0608, "grad_norm": 2.65625, "grad_norm_var": 0.013361612955729166, "learning_rate": 0.0001, "loss": 4.9243, "loss/crossentropy": 2.056231141090393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2963344603776932, "step": 3040 }, { "epoch": 0.06084, "grad_norm": 2.609375, "grad_norm_var": 0.013646443684895834, "learning_rate": 0.0001, "loss": 4.8612, "loss/crossentropy": 2.000037968158722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27152783423662186, "step": 3042 }, { "epoch": 0.06088, "grad_norm": 2.59375, "grad_norm_var": 0.0111480712890625, "learning_rate": 0.0001, "loss": 4.8576, "loss/crossentropy": 2.323809027671814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29995349049568176, "step": 3044 }, { "epoch": 0.06092, "grad_norm": 2.4375, "grad_norm_var": 0.010407511393229167, "learning_rate": 0.0001, "loss": 5.0094, "loss/crossentropy": 2.06933856010437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26831966638565063, "step": 3046 }, { "epoch": 0.06096, "grad_norm": 2.53125, "grad_norm_var": 0.008463541666666666, "learning_rate": 0.0001, "loss": 5.042, "loss/crossentropy": 2.1903880834579468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2833031117916107, "step": 3048 }, { "epoch": 0.061, "grad_norm": 2.46875, "grad_norm_var": 0.008893839518229167, "learning_rate": 0.0001, "loss": 4.9557, "loss/crossentropy": 1.910680890083313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2708408683538437, "step": 3050 }, { "epoch": 0.06104, "grad_norm": 2.546875, "grad_norm_var": 0.0083404541015625, "learning_rate": 0.0001, "loss": 4.8902, "loss/crossentropy": 2.5203059911727905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2981104403734207, "step": 3052 }, { "epoch": 0.06108, "grad_norm": 2.5, "grad_norm_var": 0.00611572265625, "learning_rate": 0.0001, "loss": 4.68, "loss/crossentropy": 1.7918179035186768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2462354451417923, "step": 3054 }, { "epoch": 0.06112, "grad_norm": 2.65625, "grad_norm_var": 0.006441243489583333, "learning_rate": 0.0001, "loss": 4.9908, "loss/crossentropy": 2.185121774673462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3046490252017975, "step": 3056 }, { "epoch": 0.06116, "grad_norm": 2.28125, "grad_norm_var": 0.01064453125, "learning_rate": 0.0001, "loss": 4.8041, "loss/crossentropy": 1.859390914440155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2523205131292343, "step": 3058 }, { "epoch": 0.0612, "grad_norm": 2.40625, "grad_norm_var": 0.011930338541666667, "learning_rate": 0.0001, "loss": 4.7388, "loss/crossentropy": 1.9202255606651306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26971636712551117, "step": 3060 }, { "epoch": 0.06124, "grad_norm": 2.578125, "grad_norm_var": 0.011481730143229167, "learning_rate": 0.0001, "loss": 4.7421, "loss/crossentropy": 2.059127449989319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25721821188926697, "step": 3062 }, { "epoch": 0.06128, "grad_norm": 2.6875, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 5.1392, "loss/crossentropy": 2.3587669134140015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30162671208381653, "step": 3064 }, { "epoch": 0.06132, "grad_norm": 2.59375, "grad_norm_var": 0.018050130208333334, "learning_rate": 0.0001, "loss": 5.2748, "loss/crossentropy": 2.239704966545105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713697552680969, "step": 3066 }, { "epoch": 0.06136, "grad_norm": 2.796875, "grad_norm_var": 0.02076416015625, "learning_rate": 0.0001, "loss": 5.0534, "loss/crossentropy": 2.230944514274597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27418144047260284, "step": 3068 }, { "epoch": 0.0614, "grad_norm": 2.515625, "grad_norm_var": 0.020563761393229168, "learning_rate": 0.0001, "loss": 4.9145, "loss/crossentropy": 2.196586310863495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24946682900190353, "step": 3070 }, { "epoch": 0.06144, "grad_norm": 2.65625, "grad_norm_var": 0.019873046875, "learning_rate": 0.0001, "loss": 5.058, "loss/crossentropy": 2.115275800228119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2668849602341652, "step": 3072 }, { "epoch": 0.06148, "grad_norm": 2.765625, "grad_norm_var": 0.014286295572916666, "learning_rate": 0.0001, "loss": 5.1311, "loss/crossentropy": 2.0227994322776794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2587117701768875, "step": 3074 }, { "epoch": 0.06152, "grad_norm": 2.421875, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 4.711, "loss/crossentropy": 2.231368660926819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2621329501271248, "step": 3076 }, { "epoch": 0.06156, "grad_norm": 2.421875, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 4.6707, "loss/crossentropy": 2.469847083091736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2758704572916031, "step": 3078 }, { "epoch": 0.0616, "grad_norm": 2.5, "grad_norm_var": 0.01666259765625, "learning_rate": 0.0001, "loss": 4.5753, "loss/crossentropy": 2.257850766181946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28196004033088684, "step": 3080 }, { "epoch": 0.06164, "grad_norm": 2.46875, "grad_norm_var": 0.014969889322916667, "learning_rate": 0.0001, "loss": 4.8783, "loss/crossentropy": 2.52754008769989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.301498681306839, "step": 3082 }, { "epoch": 0.06168, "grad_norm": 2.484375, "grad_norm_var": 0.012279256184895834, "learning_rate": 0.0001, "loss": 4.6595, "loss/crossentropy": 2.120129644870758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30073782801628113, "step": 3084 }, { "epoch": 0.06172, "grad_norm": 2.546875, "grad_norm_var": 0.0231353759765625, "learning_rate": 0.0001, "loss": 5.017, "loss/crossentropy": 2.1483632922172546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29400819540023804, "step": 3086 }, { "epoch": 0.06176, "grad_norm": 2.59375, "grad_norm_var": 0.022297159830729166, "learning_rate": 0.0001, "loss": 5.2447, "loss/crossentropy": 2.5588048696517944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3099561035633087, "step": 3088 }, { "epoch": 0.0618, "grad_norm": 2.359375, "grad_norm_var": 0.019852701822916666, "learning_rate": 0.0001, "loss": 4.5507, "loss/crossentropy": 2.338167190551758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638262137770653, "step": 3090 }, { "epoch": 0.06184, "grad_norm": 2.5, "grad_norm_var": 0.019603474934895834, "learning_rate": 0.0001, "loss": 5.0471, "loss/crossentropy": 2.61361825466156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29420773684978485, "step": 3092 }, { "epoch": 0.06188, "grad_norm": 2.5625, "grad_norm_var": 0.019108072916666666, "learning_rate": 0.0001, "loss": 4.6931, "loss/crossentropy": 2.1078842878341675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27283959090709686, "step": 3094 }, { "epoch": 0.06192, "grad_norm": 2.296875, "grad_norm_var": 0.023958333333333335, "learning_rate": 0.0001, "loss": 4.463, "loss/crossentropy": 2.1501123905181885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3052368611097336, "step": 3096 }, { "epoch": 0.06196, "grad_norm": 2.671875, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 5.0482, "loss/crossentropy": 2.3622714281082153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2851791977882385, "step": 3098 }, { "epoch": 0.062, "grad_norm": 2.671875, "grad_norm_var": 0.05203450520833333, "learning_rate": 0.0001, "loss": 4.3506, "loss/crossentropy": 1.9757406115531921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24451570957899094, "step": 3100 }, { "epoch": 0.06204, "grad_norm": 2.859375, "grad_norm_var": 0.07011311848958333, "learning_rate": 0.0001, "loss": 5.0409, "loss/crossentropy": 2.1915838718414307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2749434858560562, "step": 3102 }, { "epoch": 0.06208, "grad_norm": 2.78125, "grad_norm_var": 0.07214253743489583, "learning_rate": 0.0001, "loss": 4.8408, "loss/crossentropy": 2.130649447441101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2590179592370987, "step": 3104 }, { "epoch": 0.06212, "grad_norm": 2.59375, "grad_norm_var": 0.06317952473958334, "learning_rate": 0.0001, "loss": 4.9083, "loss/crossentropy": 2.4478825330734253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30854369699954987, "step": 3106 }, { "epoch": 0.06216, "grad_norm": 2.578125, "grad_norm_var": 0.06524149576822917, "learning_rate": 0.0001, "loss": 4.8249, "loss/crossentropy": 1.7108858227729797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22502756118774414, "step": 3108 }, { "epoch": 0.0622, "grad_norm": 2.46875, "grad_norm_var": 0.06788736979166667, "learning_rate": 0.0001, "loss": 4.5355, "loss/crossentropy": 1.79804128408432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24767793715000153, "step": 3110 }, { "epoch": 0.06224, "grad_norm": 3.28125, "grad_norm_var": 0.07480061848958333, "learning_rate": 0.0001, "loss": 4.8779, "loss/crossentropy": 1.899846076965332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23476070165634155, "step": 3112 }, { "epoch": 0.06228, "grad_norm": 2.453125, "grad_norm_var": 0.0790679931640625, "learning_rate": 0.0001, "loss": 4.7506, "loss/crossentropy": 1.8458155393600464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24417708814144135, "step": 3114 }, { "epoch": 0.06232, "grad_norm": 2.5, "grad_norm_var": 0.06979166666666667, "learning_rate": 0.0001, "loss": 4.814, "loss/crossentropy": 1.947302520275116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2566901594400406, "step": 3116 }, { "epoch": 0.06236, "grad_norm": 2.546875, "grad_norm_var": 0.046923828125, "learning_rate": 0.0001, "loss": 4.8424, "loss/crossentropy": 2.039161205291748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26955537497997284, "step": 3118 }, { "epoch": 0.0624, "grad_norm": 2.65625, "grad_norm_var": 0.04488016764322917, "learning_rate": 0.0001, "loss": 5.1656, "loss/crossentropy": 2.1528661251068115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.291052982211113, "step": 3120 }, { "epoch": 0.06244, "grad_norm": 2.640625, "grad_norm_var": 0.04644775390625, "learning_rate": 0.0001, "loss": 4.7941, "loss/crossentropy": 2.4113996028900146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2962254136800766, "step": 3122 }, { "epoch": 0.06248, "grad_norm": 2.65625, "grad_norm_var": 0.044722493489583334, "learning_rate": 0.0001, "loss": 4.8463, "loss/crossentropy": 2.0718825459480286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24676042795181274, "step": 3124 }, { "epoch": 0.06252, "grad_norm": 2.546875, "grad_norm_var": 0.04442952473958333, "learning_rate": 0.0001, "loss": 4.9723, "loss/crossentropy": 2.433539032936096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.263284370303154, "step": 3126 }, { "epoch": 0.06256, "grad_norm": 2.421875, "grad_norm_var": 0.0087799072265625, "learning_rate": 0.0001, "loss": 4.7688, "loss/crossentropy": 2.047150671482086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655043303966522, "step": 3128 }, { "epoch": 0.0626, "grad_norm": 2.546875, "grad_norm_var": 0.008219401041666666, "learning_rate": 0.0001, "loss": 4.9257, "loss/crossentropy": 1.8897674679756165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2314881831407547, "step": 3130 }, { "epoch": 0.06264, "grad_norm": 2.390625, "grad_norm_var": 0.00797119140625, "learning_rate": 0.0001, "loss": 4.8053, "loss/crossentropy": 2.1436809301376343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26931750774383545, "step": 3132 }, { "epoch": 0.06268, "grad_norm": 2.421875, "grad_norm_var": 0.0081695556640625, "learning_rate": 0.0001, "loss": 4.67, "loss/crossentropy": 1.9773973226547241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26880529522895813, "step": 3134 }, { "epoch": 0.06272, "grad_norm": 2.984375, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 5.1236, "loss/crossentropy": 2.208973228931427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29979653656482697, "step": 3136 }, { "epoch": 0.06276, "grad_norm": 2.8125, "grad_norm_var": 0.026676432291666666, "learning_rate": 0.0001, "loss": 5.0619, "loss/crossentropy": 2.64120090007782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29249200224876404, "step": 3138 }, { "epoch": 0.0628, "grad_norm": 2.703125, "grad_norm_var": 0.0306549072265625, "learning_rate": 0.0001, "loss": 5.0152, "loss/crossentropy": 2.293095588684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2689897269010544, "step": 3140 }, { "epoch": 0.06284, "grad_norm": 2.5625, "grad_norm_var": 0.028758748372395834, "learning_rate": 0.0001, "loss": 4.71, "loss/crossentropy": 2.455227494239807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3202812373638153, "step": 3142 }, { "epoch": 0.06288, "grad_norm": 2.578125, "grad_norm_var": 0.0253814697265625, "learning_rate": 0.0001, "loss": 4.9622, "loss/crossentropy": 1.9667487740516663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2702512592077255, "step": 3144 }, { "epoch": 0.06292, "grad_norm": 2.578125, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 4.7756, "loss/crossentropy": 1.8181490898132324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2310035452246666, "step": 3146 }, { "epoch": 0.06296, "grad_norm": 2.640625, "grad_norm_var": 0.0373687744140625, "learning_rate": 0.0001, "loss": 4.9556, "loss/crossentropy": 1.9985284805297852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2944463640451431, "step": 3148 }, { "epoch": 0.063, "grad_norm": 2.28125, "grad_norm_var": 0.04142964680989583, "learning_rate": 0.0001, "loss": 5.0316, "loss/crossentropy": 2.3850419521331787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29605095088481903, "step": 3150 }, { "epoch": 0.06304, "grad_norm": 4.125, "grad_norm_var": 0.17021077473958332, "learning_rate": 0.0001, "loss": 5.2062, "loss/crossentropy": 2.3815460205078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30247962474823, "step": 3152 }, { "epoch": 0.06308, "grad_norm": 3.421875, "grad_norm_var": 0.19840087890625, "learning_rate": 0.0001, "loss": 4.5851, "loss/crossentropy": 1.995104193687439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2506560683250427, "step": 3154 }, { "epoch": 0.06312, "grad_norm": 3.328125, "grad_norm_var": 0.2305084228515625, "learning_rate": 0.0001, "loss": 4.5885, "loss/crossentropy": 2.2037696838378906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2716705799102783, "step": 3156 }, { "epoch": 0.06316, "grad_norm": 2.84375, "grad_norm_var": 0.23944905598958333, "learning_rate": 0.0001, "loss": 4.7155, "loss/crossentropy": 2.067265272140503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25495442003011703, "step": 3158 }, { "epoch": 0.0632, "grad_norm": 2.78125, "grad_norm_var": 0.23763020833333334, "learning_rate": 0.0001, "loss": 5.1219, "loss/crossentropy": 2.227355480194092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27113111317157745, "step": 3160 }, { "epoch": 0.06324, "grad_norm": 2.546875, "grad_norm_var": 0.23921610514322916, "learning_rate": 0.0001, "loss": 4.6777, "loss/crossentropy": 1.9077441096305847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2630976662039757, "step": 3162 }, { "epoch": 0.06328, "grad_norm": 2.59375, "grad_norm_var": 0.23515523274739583, "learning_rate": 0.0001, "loss": 5.0084, "loss/crossentropy": 2.1737552881240845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28365984559059143, "step": 3164 }, { "epoch": 0.06332, "grad_norm": 2.5625, "grad_norm_var": 0.22542317708333334, "learning_rate": 0.0001, "loss": 4.9822, "loss/crossentropy": 1.9357402920722961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27245980501174927, "step": 3166 }, { "epoch": 0.06336, "grad_norm": 2.390625, "grad_norm_var": 0.0995269775390625, "learning_rate": 0.0001, "loss": 4.7681, "loss/crossentropy": 1.8484191298484802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2933400571346283, "step": 3168 }, { "epoch": 0.0634, "grad_norm": 2.609375, "grad_norm_var": 0.057738240559895834, "learning_rate": 0.0001, "loss": 5.0719, "loss/crossentropy": 2.3085306882858276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2926720678806305, "step": 3170 }, { "epoch": 0.06344, "grad_norm": 2.78125, "grad_norm_var": 0.040380859375, "learning_rate": 0.0001, "loss": 5.1464, "loss/crossentropy": 2.080895185470581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638559564948082, "step": 3172 }, { "epoch": 0.06348, "grad_norm": 2.53125, "grad_norm_var": 0.03287353515625, "learning_rate": 0.0001, "loss": 4.6857, "loss/crossentropy": 2.035506248474121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692716419696808, "step": 3174 }, { "epoch": 0.06352, "grad_norm": 2.859375, "grad_norm_var": 0.035252888997395836, "learning_rate": 0.0001, "loss": 4.9479, "loss/crossentropy": 2.1712740659713745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26579485833644867, "step": 3176 }, { "epoch": 0.06356, "grad_norm": 2.96875, "grad_norm_var": 0.04252827962239583, "learning_rate": 0.0001, "loss": 5.1654, "loss/crossentropy": 2.2256147861480713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27605514228343964, "step": 3178 }, { "epoch": 0.0636, "grad_norm": 2.359375, "grad_norm_var": 0.046507771809895834, "learning_rate": 0.0001, "loss": 4.6489, "loss/crossentropy": 2.0917118191719055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560323476791382, "step": 3180 }, { "epoch": 0.06364, "grad_norm": 2.671875, "grad_norm_var": 0.045491536458333336, "learning_rate": 0.0001, "loss": 4.9833, "loss/crossentropy": 2.3109938502311707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27040477097034454, "step": 3182 }, { "epoch": 0.06368, "grad_norm": 2.65625, "grad_norm_var": 0.03984273274739583, "learning_rate": 0.0001, "loss": 5.1441, "loss/crossentropy": 2.3505672812461853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29180124402046204, "step": 3184 }, { "epoch": 0.06372, "grad_norm": 2.578125, "grad_norm_var": 0.04014383951822917, "learning_rate": 0.0001, "loss": 5.0224, "loss/crossentropy": 2.2471169233322144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2676118314266205, "step": 3186 }, { "epoch": 0.06376, "grad_norm": 2.46875, "grad_norm_var": 0.0264312744140625, "learning_rate": 0.0001, "loss": 4.8247, "loss/crossentropy": 2.3312125205993652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29243919253349304, "step": 3188 }, { "epoch": 0.0638, "grad_norm": 2.4375, "grad_norm_var": 0.02681884765625, "learning_rate": 0.0001, "loss": 4.9374, "loss/crossentropy": 2.1709930896759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2786664366722107, "step": 3190 }, { "epoch": 0.06384, "grad_norm": 2.609375, "grad_norm_var": 0.02476806640625, "learning_rate": 0.0001, "loss": 4.7391, "loss/crossentropy": 1.8477665185928345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26132869720458984, "step": 3192 }, { "epoch": 0.06388, "grad_norm": 2.6875, "grad_norm_var": 0.01724853515625, "learning_rate": 0.0001, "loss": 4.8167, "loss/crossentropy": 2.2102121114730835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26027651876211166, "step": 3194 }, { "epoch": 0.06392, "grad_norm": 2.640625, "grad_norm_var": 0.0168365478515625, "learning_rate": 0.0001, "loss": 4.6381, "loss/crossentropy": 2.0011088252067566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25629256665706635, "step": 3196 }, { "epoch": 0.06396, "grad_norm": 2.625, "grad_norm_var": 0.01685791015625, "learning_rate": 0.0001, "loss": 4.9215, "loss/crossentropy": 2.319412350654602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28315384685993195, "step": 3198 }, { "epoch": 0.064, "grad_norm": 2.328125, "grad_norm_var": 0.0145172119140625, "learning_rate": 0.0001, "loss": 4.7936, "loss/crossentropy": 2.192026138305664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.264492504298687, "step": 3200 }, { "epoch": 0.06404, "grad_norm": 2.78125, "grad_norm_var": 0.017024739583333334, "learning_rate": 0.0001, "loss": 4.8178, "loss/crossentropy": 2.1745734214782715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29369065165519714, "step": 3202 }, { "epoch": 0.06408, "grad_norm": 3.109375, "grad_norm_var": 0.042313639322916666, "learning_rate": 0.0001, "loss": 5.577, "loss/crossentropy": 2.5039013624191284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835303544998169, "step": 3204 }, { "epoch": 0.06412, "grad_norm": 2.53125, "grad_norm_var": 0.04075419108072917, "learning_rate": 0.0001, "loss": 5.0308, "loss/crossentropy": 2.4255706071853638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3014884740114212, "step": 3206 }, { "epoch": 0.06416, "grad_norm": 2.5625, "grad_norm_var": 0.03873291015625, "learning_rate": 0.0001, "loss": 5.002, "loss/crossentropy": 2.266150116920471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2698349952697754, "step": 3208 }, { "epoch": 0.0642, "grad_norm": 2.4375, "grad_norm_var": 0.03805338541666667, "learning_rate": 0.0001, "loss": 5.0173, "loss/crossentropy": 2.2042946815490723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28830648958683014, "step": 3210 }, { "epoch": 0.06424, "grad_norm": 2.5625, "grad_norm_var": 0.03578999837239583, "learning_rate": 0.0001, "loss": 5.0292, "loss/crossentropy": 2.4034690856933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2834039032459259, "step": 3212 }, { "epoch": 0.06428, "grad_norm": 2.4375, "grad_norm_var": 0.0444000244140625, "learning_rate": 0.0001, "loss": 4.6655, "loss/crossentropy": 2.1347755193710327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26405099034309387, "step": 3214 }, { "epoch": 0.06432, "grad_norm": 3.34375, "grad_norm_var": 0.07415262858072917, "learning_rate": 0.0001, "loss": 5.0862, "loss/crossentropy": 2.018262207508087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27144598215818405, "step": 3216 }, { "epoch": 0.06436, "grad_norm": 2.46875, "grad_norm_var": 0.076953125, "learning_rate": 0.0001, "loss": 4.3969, "loss/crossentropy": 2.0254003405570984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24976836144924164, "step": 3218 }, { "epoch": 0.0644, "grad_norm": 2.78125, "grad_norm_var": 0.06026102701822917, "learning_rate": 0.0001, "loss": 4.6779, "loss/crossentropy": 2.1952659487724304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2786559462547302, "step": 3220 }, { "epoch": 0.06444, "grad_norm": 2.609375, "grad_norm_var": 0.06024983723958333, "learning_rate": 0.0001, "loss": 4.8828, "loss/crossentropy": 2.1383036375045776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2570757120847702, "step": 3222 }, { "epoch": 0.06448, "grad_norm": 2.65625, "grad_norm_var": 0.05915425618489583, "learning_rate": 0.0001, "loss": 4.8248, "loss/crossentropy": 2.267812967300415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2774328589439392, "step": 3224 }, { "epoch": 0.06452, "grad_norm": 2.515625, "grad_norm_var": 0.0587066650390625, "learning_rate": 0.0001, "loss": 4.9389, "loss/crossentropy": 1.906205415725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2528962790966034, "step": 3226 }, { "epoch": 0.06456, "grad_norm": 2.453125, "grad_norm_var": 0.05895182291666667, "learning_rate": 0.0001, "loss": 4.9823, "loss/crossentropy": 2.1628336906433105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27087944746017456, "step": 3228 }, { "epoch": 0.0646, "grad_norm": 2.390625, "grad_norm_var": 0.05807291666666667, "learning_rate": 0.0001, "loss": 4.5893, "loss/crossentropy": 1.8845162391662598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2241554707288742, "step": 3230 }, { "epoch": 0.06464, "grad_norm": 2.390625, "grad_norm_var": 0.019237263997395834, "learning_rate": 0.0001, "loss": 4.9143, "loss/crossentropy": 2.4157146215438843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2770439088344574, "step": 3232 }, { "epoch": 0.06468, "grad_norm": 2.515625, "grad_norm_var": 0.021825154622395832, "learning_rate": 0.0001, "loss": 4.832, "loss/crossentropy": 1.9979816675186157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23696578294038773, "step": 3234 }, { "epoch": 0.06472, "grad_norm": 2.484375, "grad_norm_var": 0.016364542643229167, "learning_rate": 0.0001, "loss": 5.1123, "loss/crossentropy": 2.1790190935134888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.252600260078907, "step": 3236 }, { "epoch": 0.06476, "grad_norm": 2.4375, "grad_norm_var": 0.016624959309895833, "learning_rate": 0.0001, "loss": 4.5943, "loss/crossentropy": 2.1612111926078796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27929094433784485, "step": 3238 }, { "epoch": 0.0648, "grad_norm": 2.546875, "grad_norm_var": 0.08124898274739584, "learning_rate": 0.0001, "loss": 4.8293, "loss/crossentropy": 2.261025071144104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3034510314464569, "step": 3240 }, { "epoch": 0.06484, "grad_norm": 2.46875, "grad_norm_var": 0.081689453125, "learning_rate": 0.0001, "loss": 4.8243, "loss/crossentropy": 2.02247554063797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655429244041443, "step": 3242 }, { "epoch": 0.06488, "grad_norm": 2.390625, "grad_norm_var": 0.08289286295572916, "learning_rate": 0.0001, "loss": 4.9428, "loss/crossentropy": 2.495269775390625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32837189733982086, "step": 3244 }, { "epoch": 0.06492, "grad_norm": 2.53125, "grad_norm_var": 0.07550455729166666, "learning_rate": 0.0001, "loss": 4.9928, "loss/crossentropy": 2.365166425704956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27000221610069275, "step": 3246 }, { "epoch": 0.06496, "grad_norm": 2.4375, "grad_norm_var": 0.0746246337890625, "learning_rate": 0.0001, "loss": 4.837, "loss/crossentropy": 1.8728906512260437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2711311876773834, "step": 3248 }, { "epoch": 0.065, "grad_norm": 2.375, "grad_norm_var": 0.07427978515625, "learning_rate": 0.0001, "loss": 4.7916, "loss/crossentropy": 1.916531264781952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23036544770002365, "step": 3250 }, { "epoch": 0.06504, "grad_norm": 2.421875, "grad_norm_var": 0.07463277180989583, "learning_rate": 0.0001, "loss": 4.9399, "loss/crossentropy": 1.996503233909607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24317056685686111, "step": 3252 }, { "epoch": 0.06508, "grad_norm": 2.5625, "grad_norm_var": 0.07366536458333334, "learning_rate": 0.0001, "loss": 4.8746, "loss/crossentropy": 2.101921260356903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775810658931732, "step": 3254 }, { "epoch": 0.06512, "grad_norm": 2.40625, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 4.7526, "loss/crossentropy": 1.9286046028137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23964455723762512, "step": 3256 }, { "epoch": 0.06516, "grad_norm": 2.40625, "grad_norm_var": 0.005464680989583333, "learning_rate": 0.0001, "loss": 4.8216, "loss/crossentropy": 2.224915862083435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27516382932662964, "step": 3258 }, { "epoch": 0.0652, "grad_norm": 2.34375, "grad_norm_var": 0.0133697509765625, "learning_rate": 0.0001, "loss": 4.6553, "loss/crossentropy": 2.259337306022644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27185751497745514, "step": 3260 }, { "epoch": 0.06524, "grad_norm": 2.484375, "grad_norm_var": 0.013117472330729166, "learning_rate": 0.0001, "loss": 4.7095, "loss/crossentropy": 2.1081044673919678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23740330338478088, "step": 3262 }, { "epoch": 0.06528, "grad_norm": 2.546875, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 4.6375, "loss/crossentropy": 1.9032058119773865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2508246824145317, "step": 3264 }, { "epoch": 0.06532, "grad_norm": 2.671875, "grad_norm_var": 0.013353474934895833, "learning_rate": 0.0001, "loss": 4.7165, "loss/crossentropy": 2.0773792266845703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2797544598579407, "step": 3266 }, { "epoch": 0.06536, "grad_norm": 2.40625, "grad_norm_var": 0.013509114583333334, "learning_rate": 0.0001, "loss": 4.9136, "loss/crossentropy": 2.1591526865959167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2540442571043968, "step": 3268 }, { "epoch": 0.0654, "grad_norm": 2.546875, "grad_norm_var": 0.014351399739583333, "learning_rate": 0.0001, "loss": 4.6248, "loss/crossentropy": 1.7735809683799744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24107928574085236, "step": 3270 }, { "epoch": 0.06544, "grad_norm": 2.5, "grad_norm_var": 0.014647420247395833, "learning_rate": 0.0001, "loss": 4.7568, "loss/crossentropy": 2.069553792476654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26356737315654755, "step": 3272 }, { "epoch": 0.06548, "grad_norm": 2.453125, "grad_norm_var": 0.014623006184895834, "learning_rate": 0.0001, "loss": 4.7274, "loss/crossentropy": 1.9688642024993896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29453104734420776, "step": 3274 }, { "epoch": 0.06552, "grad_norm": 2.296875, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 4.4264, "loss/crossentropy": 1.785252034664154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23213861882686615, "step": 3276 }, { "epoch": 0.06556, "grad_norm": 2.65625, "grad_norm_var": 0.013255818684895834, "learning_rate": 0.0001, "loss": 4.779, "loss/crossentropy": 2.155774712562561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25758183747529984, "step": 3278 }, { "epoch": 0.0656, "grad_norm": 2.5625, "grad_norm_var": 0.016161092122395835, "learning_rate": 0.0001, "loss": 5.0453, "loss/crossentropy": 2.0403348803520203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25007252395153046, "step": 3280 }, { "epoch": 0.06564, "grad_norm": 2.4375, "grad_norm_var": 0.015208943684895834, "learning_rate": 0.0001, "loss": 4.7161, "loss/crossentropy": 1.9963608384132385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2608266994357109, "step": 3282 }, { "epoch": 0.06568, "grad_norm": 2.75, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 4.9275, "loss/crossentropy": 2.2021098732948303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28361018002033234, "step": 3284 }, { "epoch": 0.06572, "grad_norm": 2.59375, "grad_norm_var": 0.01812744140625, "learning_rate": 0.0001, "loss": 4.6533, "loss/crossentropy": 2.0363903641700745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2641526162624359, "step": 3286 }, { "epoch": 0.06576, "grad_norm": 2.671875, "grad_norm_var": 0.020466105143229166, "learning_rate": 0.0001, "loss": 4.6884, "loss/crossentropy": 2.1052531003952026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24344927817583084, "step": 3288 }, { "epoch": 0.0658, "grad_norm": 2.578125, "grad_norm_var": 0.02027587890625, "learning_rate": 0.0001, "loss": 4.9246, "loss/crossentropy": 2.0664029717445374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25679293274879456, "step": 3290 }, { "epoch": 0.06584, "grad_norm": 2.625, "grad_norm_var": 0.013678995768229167, "learning_rate": 0.0001, "loss": 4.7802, "loss/crossentropy": 2.168351709842682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2759328931570053, "step": 3292 }, { "epoch": 0.06588, "grad_norm": 2.515625, "grad_norm_var": 0.013703409830729167, "learning_rate": 0.0001, "loss": 4.8481, "loss/crossentropy": 1.9305949211120605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26165173947811127, "step": 3294 }, { "epoch": 0.06592, "grad_norm": 2.4375, "grad_norm_var": 0.017476399739583332, "learning_rate": 0.0001, "loss": 4.6942, "loss/crossentropy": 1.9257155060768127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24499841034412384, "step": 3296 }, { "epoch": 0.06596, "grad_norm": 2.484375, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 4.8941, "loss/crossentropy": 2.1406426429748535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26676414906978607, "step": 3298 }, { "epoch": 0.066, "grad_norm": 2.734375, "grad_norm_var": 0.016422526041666666, "learning_rate": 0.0001, "loss": 4.8293, "loss/crossentropy": 2.099781036376953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2916935384273529, "step": 3300 }, { "epoch": 0.06604, "grad_norm": 2.875, "grad_norm_var": 0.023094685872395833, "learning_rate": 0.0001, "loss": 4.9504, "loss/crossentropy": 2.1077913641929626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667912393808365, "step": 3302 }, { "epoch": 0.06608, "grad_norm": 3.90625, "grad_norm_var": 0.1297271728515625, "learning_rate": 0.0001, "loss": 5.2952, "loss/crossentropy": 2.2583223581314087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2907385528087616, "step": 3304 }, { "epoch": 0.06612, "grad_norm": 2.71875, "grad_norm_var": 0.13405659993489583, "learning_rate": 0.0001, "loss": 4.5813, "loss/crossentropy": 1.847477912902832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2272188812494278, "step": 3306 }, { "epoch": 0.06616, "grad_norm": 2.40625, "grad_norm_var": 0.1392486572265625, "learning_rate": 0.0001, "loss": 4.6502, "loss/crossentropy": 1.7610225677490234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23973071575164795, "step": 3308 }, { "epoch": 0.0662, "grad_norm": 2.328125, "grad_norm_var": 0.1464019775390625, "learning_rate": 0.0001, "loss": 4.7123, "loss/crossentropy": 1.977916419506073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25456882268190384, "step": 3310 }, { "epoch": 0.06624, "grad_norm": 2.734375, "grad_norm_var": 0.14095052083333334, "learning_rate": 0.0001, "loss": 4.7353, "loss/crossentropy": 2.2196428775787354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563377171754837, "step": 3312 }, { "epoch": 0.06628, "grad_norm": 2.53125, "grad_norm_var": 0.14537760416666667, "learning_rate": 0.0001, "loss": 4.6907, "loss/crossentropy": 2.0861470699310303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27121224999427795, "step": 3314 }, { "epoch": 0.06632, "grad_norm": 2.53125, "grad_norm_var": 0.14287821451822916, "learning_rate": 0.0001, "loss": 4.6484, "loss/crossentropy": 1.9716283679008484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.277963787317276, "step": 3316 }, { "epoch": 0.06636, "grad_norm": 2.609375, "grad_norm_var": 0.1375, "learning_rate": 0.0001, "loss": 4.7985, "loss/crossentropy": 2.5604729652404785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31031742691993713, "step": 3318 }, { "epoch": 0.0664, "grad_norm": 2.71875, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 5.4329, "loss/crossentropy": 2.2672252655029297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29225634038448334, "step": 3320 }, { "epoch": 0.06644, "grad_norm": 2.671875, "grad_norm_var": 0.020015462239583334, "learning_rate": 0.0001, "loss": 5.0557, "loss/crossentropy": 2.0992931723594666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2648598402738571, "step": 3322 }, { "epoch": 0.06648, "grad_norm": 2.65625, "grad_norm_var": 0.014827473958333334, "learning_rate": 0.0001, "loss": 5.0895, "loss/crossentropy": 2.208917260169983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2998420298099518, "step": 3324 }, { "epoch": 0.06652, "grad_norm": 2.671875, "grad_norm_var": 0.012791951497395834, "learning_rate": 0.0001, "loss": 4.8834, "loss/crossentropy": 2.290796995162964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638905793428421, "step": 3326 }, { "epoch": 0.06656, "grad_norm": 2.375, "grad_norm_var": 0.012214152018229167, "learning_rate": 0.0001, "loss": 4.8521, "loss/crossentropy": 2.4156445264816284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28743064403533936, "step": 3328 }, { "epoch": 0.0666, "grad_norm": 2.734375, "grad_norm_var": 0.012132771809895833, "learning_rate": 0.0001, "loss": 5.2205, "loss/crossentropy": 2.4604904651641846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27585768699645996, "step": 3330 }, { "epoch": 0.06664, "grad_norm": 2.421875, "grad_norm_var": 0.013818359375, "learning_rate": 0.0001, "loss": 4.8296, "loss/crossentropy": 1.8613844513893127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27038969844579697, "step": 3332 }, { "epoch": 0.06668, "grad_norm": 2.53125, "grad_norm_var": 0.014046223958333333, "learning_rate": 0.0001, "loss": 4.8777, "loss/crossentropy": 2.232776403427124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2686954140663147, "step": 3334 }, { "epoch": 0.06672, "grad_norm": 2.5625, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 4.9536, "loss/crossentropy": 2.070033550262451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26176171004772186, "step": 3336 }, { "epoch": 0.06676, "grad_norm": 2.4375, "grad_norm_var": 0.01187744140625, "learning_rate": 0.0001, "loss": 4.8616, "loss/crossentropy": 2.38937509059906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2539200633764267, "step": 3338 }, { "epoch": 0.0668, "grad_norm": 2.53125, "grad_norm_var": 0.010933430989583333, "learning_rate": 0.0001, "loss": 5.1104, "loss/crossentropy": 2.272845983505249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2856733053922653, "step": 3340 }, { "epoch": 0.06684, "grad_norm": 2.5625, "grad_norm_var": 0.01181640625, "learning_rate": 0.0001, "loss": 4.7935, "loss/crossentropy": 2.427489161491394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30572691559791565, "step": 3342 }, { "epoch": 0.06688, "grad_norm": 2.59375, "grad_norm_var": 0.009407552083333333, "learning_rate": 0.0001, "loss": 4.9801, "loss/crossentropy": 2.4701327085494995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28469331562519073, "step": 3344 }, { "epoch": 0.06692, "grad_norm": 2.421875, "grad_norm_var": 0.006917317708333333, "learning_rate": 0.0001, "loss": 4.815, "loss/crossentropy": 2.0733558535575867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.292859822511673, "step": 3346 }, { "epoch": 0.06696, "grad_norm": 2.640625, "grad_norm_var": 0.0061187744140625, "learning_rate": 0.0001, "loss": 4.8379, "loss/crossentropy": 2.301755905151367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3174070864915848, "step": 3348 }, { "epoch": 0.067, "grad_norm": 2.234375, "grad_norm_var": 0.012580362955729167, "learning_rate": 0.0001, "loss": 4.3359, "loss/crossentropy": 2.039419114589691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24543144553899765, "step": 3350 }, { "epoch": 0.06704, "grad_norm": 2.4375, "grad_norm_var": 0.0144683837890625, "learning_rate": 0.0001, "loss": 4.7938, "loss/crossentropy": 1.9247611165046692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2595005929470062, "step": 3352 }, { "epoch": 0.06708, "grad_norm": 2.34375, "grad_norm_var": 0.01630859375, "learning_rate": 0.0001, "loss": 4.8294, "loss/crossentropy": 2.224974751472473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26880575716495514, "step": 3354 }, { "epoch": 0.06712, "grad_norm": 2.734375, "grad_norm_var": 0.0195220947265625, "learning_rate": 0.0001, "loss": 4.8922, "loss/crossentropy": 2.2601993083953857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.279633030295372, "step": 3356 }, { "epoch": 0.06716, "grad_norm": 2.4375, "grad_norm_var": 0.017085774739583334, "learning_rate": 0.0001, "loss": 4.5911, "loss/crossentropy": 1.8156417608261108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22118539363145828, "step": 3358 }, { "epoch": 0.0672, "grad_norm": 2.734375, "grad_norm_var": 0.020140584309895834, "learning_rate": 0.0001, "loss": 5.2032, "loss/crossentropy": 2.207859516143799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2771998345851898, "step": 3360 }, { "epoch": 0.06724, "grad_norm": 2.453125, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 4.9352, "loss/crossentropy": 1.9886083602905273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545766308903694, "step": 3362 }, { "epoch": 0.06728, "grad_norm": 2.234375, "grad_norm_var": 0.025777180989583332, "learning_rate": 0.0001, "loss": 4.4323, "loss/crossentropy": 1.7046592235565186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207612618803978, "step": 3364 }, { "epoch": 0.06732, "grad_norm": 2.453125, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 5.0209, "loss/crossentropy": 2.0283663868904114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2743644416332245, "step": 3366 }, { "epoch": 0.06736, "grad_norm": 2.453125, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 4.4788, "loss/crossentropy": 1.975772500038147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505730241537094, "step": 3368 }, { "epoch": 0.0674, "grad_norm": 2.5, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 4.6863, "loss/crossentropy": 1.9021872282028198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25430944561958313, "step": 3370 }, { "epoch": 0.06744, "grad_norm": 2.671875, "grad_norm_var": 0.0182037353515625, "learning_rate": 0.0001, "loss": 5.1859, "loss/crossentropy": 2.3888463973999023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27896443009376526, "step": 3372 }, { "epoch": 0.06748, "grad_norm": 2.71875, "grad_norm_var": 0.02086181640625, "learning_rate": 0.0001, "loss": 4.6041, "loss/crossentropy": 1.7844690680503845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536455988883972, "step": 3374 }, { "epoch": 0.06752, "grad_norm": 2.703125, "grad_norm_var": 0.021442667643229166, "learning_rate": 0.0001, "loss": 4.4538, "loss/crossentropy": 1.919598639011383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26129309833049774, "step": 3376 }, { "epoch": 0.06756, "grad_norm": 2.671875, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 5.0568, "loss/crossentropy": 2.2292110919952393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25612247735261917, "step": 3378 }, { "epoch": 0.0676, "grad_norm": 3.4375, "grad_norm_var": 0.0779296875, "learning_rate": 0.0001, "loss": 4.8674, "loss/crossentropy": 2.2235841751098633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2607487738132477, "step": 3380 }, { "epoch": 0.06764, "grad_norm": 2.34375, "grad_norm_var": 0.08089192708333333, "learning_rate": 0.0001, "loss": 4.5872, "loss/crossentropy": 2.1375235319137573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2344205006957054, "step": 3382 }, { "epoch": 0.06768, "grad_norm": 2.28125, "grad_norm_var": 0.08501688639322917, "learning_rate": 0.0001, "loss": 4.6076, "loss/crossentropy": 2.020237445831299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550392523407936, "step": 3384 }, { "epoch": 0.06772, "grad_norm": 2.734375, "grad_norm_var": 0.07911783854166667, "learning_rate": 0.0001, "loss": 4.6939, "loss/crossentropy": 2.138959765434265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2742607295513153, "step": 3386 }, { "epoch": 0.06776, "grad_norm": 2.578125, "grad_norm_var": 0.07864481608072917, "learning_rate": 0.0001, "loss": 4.9065, "loss/crossentropy": 2.521559953689575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2892928719520569, "step": 3388 }, { "epoch": 0.0678, "grad_norm": 2.375, "grad_norm_var": 0.07815348307291667, "learning_rate": 0.0001, "loss": 5.049, "loss/crossentropy": 2.169550120830536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27750439941883087, "step": 3390 }, { "epoch": 0.06784, "grad_norm": 2.6875, "grad_norm_var": 0.07464090983072917, "learning_rate": 0.0001, "loss": 4.7401, "loss/crossentropy": 1.975584864616394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557516545057297, "step": 3392 }, { "epoch": 0.06788, "grad_norm": 2.6875, "grad_norm_var": 0.07617899576822916, "learning_rate": 0.0001, "loss": 4.9207, "loss/crossentropy": 2.5837322473526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3444886952638626, "step": 3394 }, { "epoch": 0.06792, "grad_norm": 2.546875, "grad_norm_var": 0.022294108072916666, "learning_rate": 0.0001, "loss": 4.743, "loss/crossentropy": 1.963110864162445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24120041728019714, "step": 3396 }, { "epoch": 0.06796, "grad_norm": 2.65625, "grad_norm_var": 0.0211090087890625, "learning_rate": 0.0001, "loss": 4.5759, "loss/crossentropy": 2.381603956222534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26319295167922974, "step": 3398 }, { "epoch": 0.068, "grad_norm": 2.25, "grad_norm_var": 0.023078409830729167, "learning_rate": 0.0001, "loss": 4.5642, "loss/crossentropy": 2.054026961326599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24540280550718307, "step": 3400 }, { "epoch": 0.06804, "grad_norm": 2.46875, "grad_norm_var": 0.0187652587890625, "learning_rate": 0.0001, "loss": 4.3661, "loss/crossentropy": 2.047453820705414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2524164840579033, "step": 3402 }, { "epoch": 0.06808, "grad_norm": 2.390625, "grad_norm_var": 0.018317667643229167, "learning_rate": 0.0001, "loss": 4.7839, "loss/crossentropy": 1.8616933226585388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23905867338180542, "step": 3404 }, { "epoch": 0.06812, "grad_norm": 2.4375, "grad_norm_var": 0.016422526041666666, "learning_rate": 0.0001, "loss": 4.6932, "loss/crossentropy": 2.4304568767547607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2802550047636032, "step": 3406 }, { "epoch": 0.06816, "grad_norm": 2.5, "grad_norm_var": 0.01318359375, "learning_rate": 0.0001, "loss": 4.7005, "loss/crossentropy": 1.8178748488426208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24266308546066284, "step": 3408 }, { "epoch": 0.0682, "grad_norm": 2.625, "grad_norm_var": 0.0116363525390625, "learning_rate": 0.0001, "loss": 5.0152, "loss/crossentropy": 2.025859773159027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.282450333237648, "step": 3410 }, { "epoch": 0.06824, "grad_norm": 2.53125, "grad_norm_var": 0.010383097330729167, "learning_rate": 0.0001, "loss": 4.737, "loss/crossentropy": 2.032994568347931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26047058403491974, "step": 3412 }, { "epoch": 0.06828, "grad_norm": 2.46875, "grad_norm_var": 0.007840983072916667, "learning_rate": 0.0001, "loss": 4.6428, "loss/crossentropy": 2.2468607425689697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814445495605469, "step": 3414 }, { "epoch": 0.06832, "grad_norm": 2.75, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 4.6854, "loss/crossentropy": 2.2534161806106567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3231179416179657, "step": 3416 }, { "epoch": 0.06836, "grad_norm": 2.65625, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 4.9599, "loss/crossentropy": 2.1678181886672974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2840191572904587, "step": 3418 }, { "epoch": 0.0684, "grad_norm": 2.328125, "grad_norm_var": 0.03453776041666667, "learning_rate": 0.0001, "loss": 4.8667, "loss/crossentropy": 2.053459882736206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.271483838558197, "step": 3420 }, { "epoch": 0.06844, "grad_norm": 2.53125, "grad_norm_var": 0.03243815104166667, "learning_rate": 0.0001, "loss": 4.714, "loss/crossentropy": 2.2278919219970703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2781473994255066, "step": 3422 }, { "epoch": 0.06848, "grad_norm": 2.296875, "grad_norm_var": 0.0384674072265625, "learning_rate": 0.0001, "loss": 4.5262, "loss/crossentropy": 1.9582479000091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24171434342861176, "step": 3424 }, { "epoch": 0.06852, "grad_norm": 2.359375, "grad_norm_var": 0.04121805826822917, "learning_rate": 0.0001, "loss": 4.4734, "loss/crossentropy": 1.808964192867279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22458729147911072, "step": 3426 }, { "epoch": 0.06856, "grad_norm": 2.40625, "grad_norm_var": 0.04296468098958333, "learning_rate": 0.0001, "loss": 4.464, "loss/crossentropy": 1.9225260019302368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2508438527584076, "step": 3428 }, { "epoch": 0.0686, "grad_norm": 2.578125, "grad_norm_var": 0.044465128580729166, "learning_rate": 0.0001, "loss": 4.9647, "loss/crossentropy": 2.2446881532669067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26919613778591156, "step": 3430 }, { "epoch": 0.06864, "grad_norm": 2.515625, "grad_norm_var": 0.017899576822916666, "learning_rate": 0.0001, "loss": 4.6865, "loss/crossentropy": 2.2047033309936523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2581355720758438, "step": 3432 }, { "epoch": 0.06868, "grad_norm": 2.46875, "grad_norm_var": 0.015729777018229165, "learning_rate": 0.0001, "loss": 4.7139, "loss/crossentropy": 2.1223543882369995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25814756751060486, "step": 3434 }, { "epoch": 0.06872, "grad_norm": 2.59375, "grad_norm_var": 0.016813151041666665, "learning_rate": 0.0001, "loss": 4.6771, "loss/crossentropy": 2.1641053557395935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25222497433423996, "step": 3436 }, { "epoch": 0.06876, "grad_norm": 2.640625, "grad_norm_var": 0.0189605712890625, "learning_rate": 0.0001, "loss": 4.6972, "loss/crossentropy": 1.9569795727729797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2681007981300354, "step": 3438 }, { "epoch": 0.0688, "grad_norm": 2.453125, "grad_norm_var": 0.013216145833333333, "learning_rate": 0.0001, "loss": 4.8214, "loss/crossentropy": 2.212220251560211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28191742300987244, "step": 3440 }, { "epoch": 0.06884, "grad_norm": 2.34375, "grad_norm_var": 0.011546834309895834, "learning_rate": 0.0001, "loss": 4.4162, "loss/crossentropy": 1.9564262628555298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560906559228897, "step": 3442 }, { "epoch": 0.06888, "grad_norm": 2.46875, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 4.6837, "loss/crossentropy": 1.8553346395492554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24626825004816055, "step": 3444 }, { "epoch": 0.06892, "grad_norm": 2.3125, "grad_norm_var": 0.012007649739583333, "learning_rate": 0.0001, "loss": 4.7914, "loss/crossentropy": 1.9803723692893982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2712126225233078, "step": 3446 }, { "epoch": 0.06896, "grad_norm": 2.46875, "grad_norm_var": 0.010423787434895833, "learning_rate": 0.0001, "loss": 4.857, "loss/crossentropy": 1.9914751648902893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565220594406128, "step": 3448 }, { "epoch": 0.069, "grad_norm": 2.484375, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 4.8013, "loss/crossentropy": 2.2114094495773315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28365010023117065, "step": 3450 }, { "epoch": 0.06904, "grad_norm": 2.453125, "grad_norm_var": 0.009577433268229166, "learning_rate": 0.0001, "loss": 5.0322, "loss/crossentropy": 2.4366514682769775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30788426101207733, "step": 3452 }, { "epoch": 0.06908, "grad_norm": 2.671875, "grad_norm_var": 0.010595703125, "learning_rate": 0.0001, "loss": 4.7902, "loss/crossentropy": 2.304569959640503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2835633158683777, "step": 3454 }, { "epoch": 0.06912, "grad_norm": 2.65625, "grad_norm_var": 0.027318318684895832, "learning_rate": 0.0001, "loss": 5.151, "loss/crossentropy": 2.2518080472946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.278719499707222, "step": 3456 }, { "epoch": 0.06916, "grad_norm": 2.71875, "grad_norm_var": 0.027904256184895834, "learning_rate": 0.0001, "loss": 5.1688, "loss/crossentropy": 2.333768129348755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28694969415664673, "step": 3458 }, { "epoch": 0.0692, "grad_norm": 2.46875, "grad_norm_var": 0.0260650634765625, "learning_rate": 0.0001, "loss": 5.1421, "loss/crossentropy": 2.3534432649612427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3005864769220352, "step": 3460 }, { "epoch": 0.06924, "grad_norm": 2.4375, "grad_norm_var": 0.022981770833333335, "learning_rate": 0.0001, "loss": 4.9502, "loss/crossentropy": 2.0703017711639404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25969168543815613, "step": 3462 }, { "epoch": 0.06928, "grad_norm": 2.65625, "grad_norm_var": 0.023502604166666666, "learning_rate": 0.0001, "loss": 5.0258, "loss/crossentropy": 2.167420506477356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32322582602500916, "step": 3464 }, { "epoch": 0.06932, "grad_norm": 2.515625, "grad_norm_var": 0.021222941080729165, "learning_rate": 0.0001, "loss": 4.5342, "loss/crossentropy": 2.0845181941986084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26804475486278534, "step": 3466 }, { "epoch": 0.06936, "grad_norm": 2.4375, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 5.0459, "loss/crossentropy": 2.4165114164352417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2866530567407608, "step": 3468 }, { "epoch": 0.0694, "grad_norm": 2.765625, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 5.0938, "loss/crossentropy": 2.4152863025665283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30185502767562866, "step": 3470 }, { "epoch": 0.06944, "grad_norm": 2.28125, "grad_norm_var": 0.014924112955729167, "learning_rate": 0.0001, "loss": 4.6362, "loss/crossentropy": 2.1878501176834106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2736133933067322, "step": 3472 }, { "epoch": 0.06948, "grad_norm": 2.4375, "grad_norm_var": 0.015262858072916666, "learning_rate": 0.0001, "loss": 4.812, "loss/crossentropy": 2.055173695087433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2609568238258362, "step": 3474 }, { "epoch": 0.06952, "grad_norm": 2.453125, "grad_norm_var": 0.015550740559895833, "learning_rate": 0.0001, "loss": 4.8201, "loss/crossentropy": 2.050000250339508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713121324777603, "step": 3476 }, { "epoch": 0.06956, "grad_norm": 2.65625, "grad_norm_var": 0.019169108072916666, "learning_rate": 0.0001, "loss": 4.9916, "loss/crossentropy": 2.227464199066162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710970640182495, "step": 3478 }, { "epoch": 0.0696, "grad_norm": 2.46875, "grad_norm_var": 0.016828409830729165, "learning_rate": 0.0001, "loss": 4.7435, "loss/crossentropy": 2.096015691757202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29556676745414734, "step": 3480 }, { "epoch": 0.06964, "grad_norm": 2.921875, "grad_norm_var": 0.028804524739583334, "learning_rate": 0.0001, "loss": 4.6738, "loss/crossentropy": 1.9252901673316956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2976933419704437, "step": 3482 }, { "epoch": 0.06968, "grad_norm": 2.25, "grad_norm_var": 0.03385009765625, "learning_rate": 0.0001, "loss": 4.7679, "loss/crossentropy": 2.258090019226074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27399829030036926, "step": 3484 }, { "epoch": 0.06972, "grad_norm": 2.265625, "grad_norm_var": 0.03192952473958333, "learning_rate": 0.0001, "loss": 4.7614, "loss/crossentropy": 2.2776867151260376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29224833846092224, "step": 3486 }, { "epoch": 0.06976, "grad_norm": 2.703125, "grad_norm_var": 0.031819661458333336, "learning_rate": 0.0001, "loss": 5.1628, "loss/crossentropy": 2.097061276435852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26103661954402924, "step": 3488 }, { "epoch": 0.0698, "grad_norm": 2.28125, "grad_norm_var": 0.03264567057291667, "learning_rate": 0.0001, "loss": 4.7364, "loss/crossentropy": 2.206419885158539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2775641232728958, "step": 3490 }, { "epoch": 0.06984, "grad_norm": 2.59375, "grad_norm_var": 0.035563151041666664, "learning_rate": 0.0001, "loss": 4.7216, "loss/crossentropy": 1.962704062461853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2421455979347229, "step": 3492 }, { "epoch": 0.06988, "grad_norm": 2.4375, "grad_norm_var": 0.032698567708333334, "learning_rate": 0.0001, "loss": 4.7358, "loss/crossentropy": 2.223302483558655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2709382176399231, "step": 3494 }, { "epoch": 0.06992, "grad_norm": 2.390625, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 4.8563, "loss/crossentropy": 2.259950876235962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29514479637145996, "step": 3496 }, { "epoch": 0.06996, "grad_norm": 2.359375, "grad_norm_var": 0.021141560872395833, "learning_rate": 0.0001, "loss": 4.6147, "loss/crossentropy": 2.2337416410446167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585422098636627, "step": 3498 }, { "epoch": 0.07, "grad_norm": 2.75, "grad_norm_var": 0.022663370768229166, "learning_rate": 0.0001, "loss": 4.9278, "loss/crossentropy": 2.131904423236847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32493171095848083, "step": 3500 }, { "epoch": 0.07004, "grad_norm": 3.109375, "grad_norm_var": 0.04553629557291667, "learning_rate": 0.0001, "loss": 4.9285, "loss/crossentropy": 2.461912155151367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2917899489402771, "step": 3502 }, { "epoch": 0.07008, "grad_norm": 2.40625, "grad_norm_var": 0.04366861979166667, "learning_rate": 0.0001, "loss": 4.6466, "loss/crossentropy": 2.05659943819046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24786780774593353, "step": 3504 }, { "epoch": 0.07012, "grad_norm": 2.625, "grad_norm_var": 0.0412261962890625, "learning_rate": 0.0001, "loss": 4.7282, "loss/crossentropy": 2.3972705602645874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29012058675289154, "step": 3506 }, { "epoch": 0.07016, "grad_norm": 2.390625, "grad_norm_var": 0.03945210774739583, "learning_rate": 0.0001, "loss": 4.6561, "loss/crossentropy": 1.8465647101402283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26938022673130035, "step": 3508 }, { "epoch": 0.0702, "grad_norm": 2.65625, "grad_norm_var": 0.0434722900390625, "learning_rate": 0.0001, "loss": 4.9007, "loss/crossentropy": 2.2447493076324463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27260421216487885, "step": 3510 }, { "epoch": 0.07024, "grad_norm": 2.453125, "grad_norm_var": 0.042464192708333334, "learning_rate": 0.0001, "loss": 4.7774, "loss/crossentropy": 2.4258209466934204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.285652831196785, "step": 3512 }, { "epoch": 0.07028, "grad_norm": 2.328125, "grad_norm_var": 0.04248046875, "learning_rate": 0.0001, "loss": 4.7353, "loss/crossentropy": 2.1033068895339966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628294378519058, "step": 3514 }, { "epoch": 0.07032, "grad_norm": 2.5, "grad_norm_var": 0.03819071451822917, "learning_rate": 0.0001, "loss": 4.8036, "loss/crossentropy": 2.2740964889526367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27228833734989166, "step": 3516 }, { "epoch": 0.07036, "grad_norm": 2.265625, "grad_norm_var": 0.015104166666666667, "learning_rate": 0.0001, "loss": 4.7699, "loss/crossentropy": 2.2315655946731567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26613348722457886, "step": 3518 }, { "epoch": 0.0704, "grad_norm": 2.359375, "grad_norm_var": 0.014720662434895834, "learning_rate": 0.0001, "loss": 4.6038, "loss/crossentropy": 1.9001839756965637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.248238705098629, "step": 3520 }, { "epoch": 0.07044, "grad_norm": 2.59375, "grad_norm_var": 0.013655598958333333, "learning_rate": 0.0001, "loss": 4.7034, "loss/crossentropy": 2.0940937399864197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28850243985652924, "step": 3522 }, { "epoch": 0.07048, "grad_norm": 2.421875, "grad_norm_var": 0.014338175455729166, "learning_rate": 0.0001, "loss": 4.9187, "loss/crossentropy": 1.9088054299354553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2577759325504303, "step": 3524 }, { "epoch": 0.07052, "grad_norm": 2.359375, "grad_norm_var": 0.01031494140625, "learning_rate": 0.0001, "loss": 4.7852, "loss/crossentropy": 2.1965672969818115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2696071192622185, "step": 3526 }, { "epoch": 0.07056, "grad_norm": 2.59375, "grad_norm_var": 0.010770670572916667, "learning_rate": 0.0001, "loss": 4.7657, "loss/crossentropy": 2.245758891105652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2646239101886749, "step": 3528 }, { "epoch": 0.0706, "grad_norm": 2.546875, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 4.7794, "loss/crossentropy": 2.180204927921295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2582162171602249, "step": 3530 }, { "epoch": 0.07064, "grad_norm": 2.71875, "grad_norm_var": 0.019562784830729166, "learning_rate": 0.0001, "loss": 5.04, "loss/crossentropy": 2.193474531173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.40557755529880524, "step": 3532 }, { "epoch": 0.07068, "grad_norm": 2.78125, "grad_norm_var": 0.021361287434895834, "learning_rate": 0.0001, "loss": 4.9562, "loss/crossentropy": 2.2667607069015503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28741554915905, "step": 3534 }, { "epoch": 0.07072, "grad_norm": 2.3125, "grad_norm_var": 0.022093709309895834, "learning_rate": 0.0001, "loss": 4.8215, "loss/crossentropy": 1.9890388250350952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2612537443637848, "step": 3536 }, { "epoch": 0.07076, "grad_norm": 2.390625, "grad_norm_var": 0.02392578125, "learning_rate": 0.0001, "loss": 4.7544, "loss/crossentropy": 1.9390615820884705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23259633034467697, "step": 3538 }, { "epoch": 0.0708, "grad_norm": 2.515625, "grad_norm_var": 0.023193359375, "learning_rate": 0.0001, "loss": 4.8227, "loss/crossentropy": 2.3050389289855957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2738967537879944, "step": 3540 }, { "epoch": 0.07084, "grad_norm": 2.265625, "grad_norm_var": 0.025862630208333334, "learning_rate": 0.0001, "loss": 4.4323, "loss/crossentropy": 2.3832077980041504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29863911867141724, "step": 3542 }, { "epoch": 0.07088, "grad_norm": 2.4375, "grad_norm_var": 0.02822265625, "learning_rate": 0.0001, "loss": 4.7101, "loss/crossentropy": 1.8186699748039246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22366883605718613, "step": 3544 }, { "epoch": 0.07092, "grad_norm": 2.46875, "grad_norm_var": 0.024235026041666666, "learning_rate": 0.0001, "loss": 4.8151, "loss/crossentropy": 2.2650288343429565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2733971029520035, "step": 3546 }, { "epoch": 0.07096, "grad_norm": 2.53125, "grad_norm_var": 0.0191070556640625, "learning_rate": 0.0001, "loss": 4.7514, "loss/crossentropy": 2.432945966720581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269818976521492, "step": 3548 }, { "epoch": 0.071, "grad_norm": 4.15625, "grad_norm_var": 0.19250386555989582, "learning_rate": 0.0001, "loss": 4.9461, "loss/crossentropy": 2.021497666835785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2488800287246704, "step": 3550 }, { "epoch": 0.07104, "grad_norm": 2.3125, "grad_norm_var": 0.2001129150390625, "learning_rate": 0.0001, "loss": 5.0341, "loss/crossentropy": 2.0840535163879395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2780974507331848, "step": 3552 }, { "epoch": 0.07108, "grad_norm": 2.515625, "grad_norm_var": 0.19724833170572917, "learning_rate": 0.0001, "loss": 4.8123, "loss/crossentropy": 2.352238416671753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2830911874771118, "step": 3554 }, { "epoch": 0.07112, "grad_norm": 2.484375, "grad_norm_var": 0.19795633951822916, "learning_rate": 0.0001, "loss": 4.9632, "loss/crossentropy": 2.395397186279297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2861028015613556, "step": 3556 }, { "epoch": 0.07116, "grad_norm": 2.84375, "grad_norm_var": 0.19630533854166668, "learning_rate": 0.0001, "loss": 5.0203, "loss/crossentropy": 2.6454248428344727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28888577222824097, "step": 3558 }, { "epoch": 0.0712, "grad_norm": 2.609375, "grad_norm_var": 0.19394429524739584, "learning_rate": 0.0001, "loss": 4.8228, "loss/crossentropy": 2.21127188205719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2676347494125366, "step": 3560 }, { "epoch": 0.07124, "grad_norm": 2.5625, "grad_norm_var": 0.19245503743489584, "learning_rate": 0.0001, "loss": 4.7502, "loss/crossentropy": 2.0187097787857056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24967321753501892, "step": 3562 }, { "epoch": 0.07128, "grad_norm": 2.4375, "grad_norm_var": 0.19409077962239582, "learning_rate": 0.0001, "loss": 4.9548, "loss/crossentropy": 2.1822619438171387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24893341958522797, "step": 3564 }, { "epoch": 0.07132, "grad_norm": 2.65625, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 5.0912, "loss/crossentropy": 2.486607313156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29665203392505646, "step": 3566 }, { "epoch": 0.07136, "grad_norm": 2.546875, "grad_norm_var": 0.020048014322916665, "learning_rate": 0.0001, "loss": 5.0494, "loss/crossentropy": 2.2631163597106934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.268096499145031, "step": 3568 }, { "epoch": 0.0714, "grad_norm": 2.671875, "grad_norm_var": 0.02760009765625, "learning_rate": 0.0001, "loss": 4.7668, "loss/crossentropy": 2.3393882513046265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29145532846450806, "step": 3570 }, { "epoch": 0.07144, "grad_norm": 2.65625, "grad_norm_var": 0.03980712890625, "learning_rate": 0.0001, "loss": 4.8281, "loss/crossentropy": 2.007299244403839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30512700974941254, "step": 3572 }, { "epoch": 0.07148, "grad_norm": 2.609375, "grad_norm_var": 0.0369049072265625, "learning_rate": 0.0001, "loss": 4.7891, "loss/crossentropy": 1.9879329800605774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22510841488838196, "step": 3574 }, { "epoch": 0.07152, "grad_norm": 2.46875, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 4.9688, "loss/crossentropy": 2.387833833694458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3007010221481323, "step": 3576 }, { "epoch": 0.07156, "grad_norm": 2.625, "grad_norm_var": 0.03664957682291667, "learning_rate": 0.0001, "loss": 4.7975, "loss/crossentropy": 2.3306411504745483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269440695643425, "step": 3578 }, { "epoch": 0.0716, "grad_norm": 2.5, "grad_norm_var": 0.03462626139322917, "learning_rate": 0.0001, "loss": 4.5348, "loss/crossentropy": 1.8359156847000122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23353691399097443, "step": 3580 }, { "epoch": 0.07164, "grad_norm": 2.5625, "grad_norm_var": 0.033080037434895834, "learning_rate": 0.0001, "loss": 4.9226, "loss/crossentropy": 2.257680654525757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2764490246772766, "step": 3582 }, { "epoch": 0.07168, "grad_norm": 2.4375, "grad_norm_var": 0.03310445149739583, "learning_rate": 0.0001, "loss": 4.6434, "loss/crossentropy": 2.589483857154846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28172188997268677, "step": 3584 }, { "epoch": 0.07172, "grad_norm": 2.390625, "grad_norm_var": 0.04182840983072917, "learning_rate": 0.0001, "loss": 4.5434, "loss/crossentropy": 1.8202016949653625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22330023348331451, "step": 3586 }, { "epoch": 0.07176, "grad_norm": 2.421875, "grad_norm_var": 0.028609212239583334, "learning_rate": 0.0001, "loss": 4.8344, "loss/crossentropy": 2.0311816334724426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2622206509113312, "step": 3588 }, { "epoch": 0.0718, "grad_norm": 2.296875, "grad_norm_var": 0.028400675455729166, "learning_rate": 0.0001, "loss": 4.5158, "loss/crossentropy": 1.7991753220558167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23093865811824799, "step": 3590 }, { "epoch": 0.07184, "grad_norm": 2.265625, "grad_norm_var": 0.030924479166666668, "learning_rate": 0.0001, "loss": 4.3148, "loss/crossentropy": 1.6141473054885864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21914401650428772, "step": 3592 }, { "epoch": 0.07188, "grad_norm": 2.34375, "grad_norm_var": 0.0289703369140625, "learning_rate": 0.0001, "loss": 4.8007, "loss/crossentropy": 2.3337208032608032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2612123265862465, "step": 3594 }, { "epoch": 0.07192, "grad_norm": 2.265625, "grad_norm_var": 0.0322662353515625, "learning_rate": 0.0001, "loss": 4.5811, "loss/crossentropy": 2.191028594970703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598741352558136, "step": 3596 }, { "epoch": 0.07196, "grad_norm": 2.546875, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 4.9562, "loss/crossentropy": 2.0293691158294678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27143266797065735, "step": 3598 }, { "epoch": 0.072, "grad_norm": 2.640625, "grad_norm_var": 0.03437398274739583, "learning_rate": 0.0001, "loss": 5.1335, "loss/crossentropy": 2.1257725954055786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33300966024398804, "step": 3600 }, { "epoch": 0.07204, "grad_norm": 2.375, "grad_norm_var": 0.0171875, "learning_rate": 0.0001, "loss": 4.8223, "loss/crossentropy": 2.296278953552246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28724825382232666, "step": 3602 }, { "epoch": 0.07208, "grad_norm": 2.453125, "grad_norm_var": 0.016731770833333333, "learning_rate": 0.0001, "loss": 4.8925, "loss/crossentropy": 2.258358597755432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26510895788669586, "step": 3604 }, { "epoch": 0.07212, "grad_norm": 2.421875, "grad_norm_var": 0.0170318603515625, "learning_rate": 0.0001, "loss": 5.0383, "loss/crossentropy": 2.0454649925231934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2725224494934082, "step": 3606 }, { "epoch": 0.07216, "grad_norm": 2.390625, "grad_norm_var": 0.015250651041666667, "learning_rate": 0.0001, "loss": 4.6582, "loss/crossentropy": 2.1844204664230347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2729914039373398, "step": 3608 }, { "epoch": 0.0722, "grad_norm": 2.484375, "grad_norm_var": 0.015608723958333333, "learning_rate": 0.0001, "loss": 4.4613, "loss/crossentropy": 1.8897106647491455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26307350397109985, "step": 3610 }, { "epoch": 0.07224, "grad_norm": 2.5, "grad_norm_var": 0.013377888997395834, "learning_rate": 0.0001, "loss": 4.5695, "loss/crossentropy": 1.9441962838172913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24454529583454132, "step": 3612 }, { "epoch": 0.07228, "grad_norm": 2.6875, "grad_norm_var": 0.063623046875, "learning_rate": 0.0001, "loss": 4.7654, "loss/crossentropy": 2.10969078540802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2900787442922592, "step": 3614 }, { "epoch": 0.07232, "grad_norm": 2.453125, "grad_norm_var": 0.06301676432291667, "learning_rate": 0.0001, "loss": 4.5195, "loss/crossentropy": 2.1384644508361816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2722831964492798, "step": 3616 }, { "epoch": 0.07236, "grad_norm": 2.296875, "grad_norm_var": 0.06412760416666667, "learning_rate": 0.0001, "loss": 4.4995, "loss/crossentropy": 2.0648157596588135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24860350787639618, "step": 3618 }, { "epoch": 0.0724, "grad_norm": 2.421875, "grad_norm_var": 0.06441650390625, "learning_rate": 0.0001, "loss": 4.7863, "loss/crossentropy": 2.2188034057617188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2476331740617752, "step": 3620 }, { "epoch": 0.07244, "grad_norm": 2.4375, "grad_norm_var": 0.06516927083333333, "learning_rate": 0.0001, "loss": 4.792, "loss/crossentropy": 2.1361395120620728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24962469190359116, "step": 3622 }, { "epoch": 0.07248, "grad_norm": 2.46875, "grad_norm_var": 0.06457417805989583, "learning_rate": 0.0001, "loss": 4.739, "loss/crossentropy": 2.2646392583847046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.260420486330986, "step": 3624 }, { "epoch": 0.07252, "grad_norm": 2.421875, "grad_norm_var": 0.0630523681640625, "learning_rate": 0.0001, "loss": 4.6937, "loss/crossentropy": 2.2822424173355103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585812509059906, "step": 3626 }, { "epoch": 0.07256, "grad_norm": 2.203125, "grad_norm_var": 0.06852925618489583, "learning_rate": 0.0001, "loss": 4.2868, "loss/crossentropy": 1.8197516798973083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24321961402893066, "step": 3628 }, { "epoch": 0.0726, "grad_norm": 2.4375, "grad_norm_var": 0.007649739583333333, "learning_rate": 0.0001, "loss": 4.4835, "loss/crossentropy": 2.1475032567977905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687895894050598, "step": 3630 }, { "epoch": 0.07264, "grad_norm": 2.25, "grad_norm_var": 0.006538899739583334, "learning_rate": 0.0001, "loss": 4.2353, "loss/crossentropy": 1.9497992992401123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24942665547132492, "step": 3632 }, { "epoch": 0.07268, "grad_norm": 2.5, "grad_norm_var": 0.007059733072916667, "learning_rate": 0.0001, "loss": 4.7455, "loss/crossentropy": 1.8786492347717285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23713821917772293, "step": 3634 }, { "epoch": 0.07272, "grad_norm": 2.4375, "grad_norm_var": 0.00738525390625, "learning_rate": 0.0001, "loss": 5.089, "loss/crossentropy": 2.474532127380371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3133770227432251, "step": 3636 }, { "epoch": 0.07276, "grad_norm": 2.484375, "grad_norm_var": 0.0073394775390625, "learning_rate": 0.0001, "loss": 4.912, "loss/crossentropy": 2.1231455206871033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2547219544649124, "step": 3638 }, { "epoch": 0.0728, "grad_norm": 2.546875, "grad_norm_var": 0.009007771809895834, "learning_rate": 0.0001, "loss": 4.4727, "loss/crossentropy": 2.0511630177497864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26947300136089325, "step": 3640 }, { "epoch": 0.07284, "grad_norm": 2.5625, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.7332, "loss/crossentropy": 1.7076187133789062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21764510869979858, "step": 3642 }, { "epoch": 0.07288, "grad_norm": 2.359375, "grad_norm_var": 0.009723917643229166, "learning_rate": 0.0001, "loss": 4.8396, "loss/crossentropy": 2.069926142692566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26851218193769455, "step": 3644 }, { "epoch": 0.07292, "grad_norm": 2.609375, "grad_norm_var": 0.010054524739583333, "learning_rate": 0.0001, "loss": 5.0802, "loss/crossentropy": 2.1369277238845825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27878354489803314, "step": 3646 }, { "epoch": 0.07296, "grad_norm": 2.375, "grad_norm_var": 0.008072916666666667, "learning_rate": 0.0001, "loss": 4.3659, "loss/crossentropy": 2.095974624156952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25285808742046356, "step": 3648 }, { "epoch": 0.073, "grad_norm": 2.5625, "grad_norm_var": 0.011888631184895833, "learning_rate": 0.0001, "loss": 4.9301, "loss/crossentropy": 2.240627646446228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24894578754901886, "step": 3650 }, { "epoch": 0.07304, "grad_norm": 2.359375, "grad_norm_var": 0.013899739583333333, "learning_rate": 0.0001, "loss": 4.6588, "loss/crossentropy": 2.2270851135253906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26908691227436066, "step": 3652 }, { "epoch": 0.07308, "grad_norm": 2.671875, "grad_norm_var": 0.017039998372395834, "learning_rate": 0.0001, "loss": 4.7385, "loss/crossentropy": 2.2684017419815063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28442947566509247, "step": 3654 }, { "epoch": 0.07312, "grad_norm": 2.46875, "grad_norm_var": 0.07785542805989583, "learning_rate": 0.0001, "loss": 4.7585, "loss/crossentropy": 2.0922030806541443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2621888816356659, "step": 3656 }, { "epoch": 0.07316, "grad_norm": 2.3125, "grad_norm_var": 0.08323567708333333, "learning_rate": 0.0001, "loss": 4.7425, "loss/crossentropy": 2.0134947896003723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2717447876930237, "step": 3658 }, { "epoch": 0.0732, "grad_norm": 2.421875, "grad_norm_var": 0.08206278483072917, "learning_rate": 0.0001, "loss": 4.5573, "loss/crossentropy": 1.9246947765350342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25095127522945404, "step": 3660 }, { "epoch": 0.07324, "grad_norm": 2.5625, "grad_norm_var": 0.08561197916666667, "learning_rate": 0.0001, "loss": 4.8616, "loss/crossentropy": 2.0655113458633423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24166538566350937, "step": 3662 }, { "epoch": 0.07328, "grad_norm": 2.46875, "grad_norm_var": 0.08198954264322916, "learning_rate": 0.0001, "loss": 4.9137, "loss/crossentropy": 2.2706735730171204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26484307646751404, "step": 3664 }, { "epoch": 0.07332, "grad_norm": 2.359375, "grad_norm_var": 0.0883453369140625, "learning_rate": 0.0001, "loss": 4.2911, "loss/crossentropy": 1.7969809770584106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23407897353172302, "step": 3666 }, { "epoch": 0.07336, "grad_norm": 2.421875, "grad_norm_var": 0.09109700520833333, "learning_rate": 0.0001, "loss": 4.7081, "loss/crossentropy": 2.0398870706558228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502745985984802, "step": 3668 }, { "epoch": 0.0734, "grad_norm": 2.40625, "grad_norm_var": 0.09381103515625, "learning_rate": 0.0001, "loss": 4.8738, "loss/crossentropy": 2.090283453464508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2936585247516632, "step": 3670 }, { "epoch": 0.07344, "grad_norm": 2.5625, "grad_norm_var": 0.035008748372395836, "learning_rate": 0.0001, "loss": 4.8305, "loss/crossentropy": 2.286925792694092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27951307594776154, "step": 3672 }, { "epoch": 0.07348, "grad_norm": 2.40625, "grad_norm_var": 0.032059733072916666, "learning_rate": 0.0001, "loss": 4.4885, "loss/crossentropy": 2.0264610052108765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24908316135406494, "step": 3674 }, { "epoch": 0.07352, "grad_norm": 2.6875, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 4.9239, "loss/crossentropy": 2.1947755217552185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2865990996360779, "step": 3676 }, { "epoch": 0.07356, "grad_norm": 2.703125, "grad_norm_var": 0.0422515869140625, "learning_rate": 0.0001, "loss": 4.8784, "loss/crossentropy": 2.0050416588783264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23362033069133759, "step": 3678 }, { "epoch": 0.0736, "grad_norm": 2.4375, "grad_norm_var": 0.041112263997395836, "learning_rate": 0.0001, "loss": 4.7423, "loss/crossentropy": 1.8935424089431763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532464489340782, "step": 3680 }, { "epoch": 0.07364, "grad_norm": 2.40625, "grad_norm_var": 0.0350250244140625, "learning_rate": 0.0001, "loss": 4.7389, "loss/crossentropy": 2.0181053280830383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2544455900788307, "step": 3682 }, { "epoch": 0.07368, "grad_norm": 2.359375, "grad_norm_var": 0.03144124348958333, "learning_rate": 0.0001, "loss": 4.7099, "loss/crossentropy": 2.1172796487808228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27609144151210785, "step": 3684 }, { "epoch": 0.07372, "grad_norm": 2.375, "grad_norm_var": 0.023737589518229168, "learning_rate": 0.0001, "loss": 4.7185, "loss/crossentropy": 2.3926355838775635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28892992436885834, "step": 3686 }, { "epoch": 0.07376, "grad_norm": 2.21875, "grad_norm_var": 0.026883951822916665, "learning_rate": 0.0001, "loss": 4.9086, "loss/crossentropy": 2.2512835264205933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26166096329689026, "step": 3688 }, { "epoch": 0.0738, "grad_norm": 2.578125, "grad_norm_var": 0.0264312744140625, "learning_rate": 0.0001, "loss": 4.6311, "loss/crossentropy": 2.0656538009643555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25399819016456604, "step": 3690 }, { "epoch": 0.07384, "grad_norm": 2.453125, "grad_norm_var": 0.0222808837890625, "learning_rate": 0.0001, "loss": 4.9565, "loss/crossentropy": 2.454928994178772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2964669317007065, "step": 3692 }, { "epoch": 0.07388, "grad_norm": 2.4375, "grad_norm_var": 0.009859212239583333, "learning_rate": 0.0001, "loss": 4.5703, "loss/crossentropy": 1.988040804862976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27345800399780273, "step": 3694 }, { "epoch": 0.07392, "grad_norm": 2.671875, "grad_norm_var": 0.013752237955729166, "learning_rate": 0.0001, "loss": 4.9418, "loss/crossentropy": 1.910742998123169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24037255346775055, "step": 3696 }, { "epoch": 0.07396, "grad_norm": 2.421875, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 4.6541, "loss/crossentropy": 2.19545578956604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2778017520904541, "step": 3698 }, { "epoch": 0.074, "grad_norm": 2.46875, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 4.3839, "loss/crossentropy": 2.436691403388977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26254843175411224, "step": 3700 }, { "epoch": 0.07404, "grad_norm": 2.5, "grad_norm_var": 0.012398274739583333, "learning_rate": 0.0001, "loss": 4.833, "loss/crossentropy": 2.7458308935165405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27952516078948975, "step": 3702 }, { "epoch": 0.07408, "grad_norm": 2.78125, "grad_norm_var": 0.014876302083333333, "learning_rate": 0.0001, "loss": 4.8308, "loss/crossentropy": 2.2321633100509644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2633504122495651, "step": 3704 }, { "epoch": 0.07412, "grad_norm": 2.359375, "grad_norm_var": 0.013801066080729167, "learning_rate": 0.0001, "loss": 4.8159, "loss/crossentropy": 1.9883576035499573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25319087505340576, "step": 3706 }, { "epoch": 0.07416, "grad_norm": 2.4375, "grad_norm_var": 0.0147857666015625, "learning_rate": 0.0001, "loss": 4.5546, "loss/crossentropy": 1.7647870182991028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21732009947299957, "step": 3708 }, { "epoch": 0.0742, "grad_norm": 2.734375, "grad_norm_var": 0.046793619791666664, "learning_rate": 0.0001, "loss": 4.9271, "loss/crossentropy": 2.1113381385803223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25888970494270325, "step": 3710 }, { "epoch": 0.07424, "grad_norm": 2.34375, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 4.5878, "loss/crossentropy": 1.975549578666687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23899925500154495, "step": 3712 }, { "epoch": 0.07428, "grad_norm": 2.40625, "grad_norm_var": 0.04664306640625, "learning_rate": 0.0001, "loss": 4.9262, "loss/crossentropy": 2.0562495589256287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31428879499435425, "step": 3714 }, { "epoch": 0.07432, "grad_norm": 2.265625, "grad_norm_var": 0.04951883951822917, "learning_rate": 0.0001, "loss": 4.3719, "loss/crossentropy": 2.114805221557617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25172004848718643, "step": 3716 }, { "epoch": 0.07436, "grad_norm": 2.65625, "grad_norm_var": 0.052783203125, "learning_rate": 0.0001, "loss": 4.6032, "loss/crossentropy": 2.1865739822387695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.252632200717926, "step": 3718 }, { "epoch": 0.0744, "grad_norm": 2.53125, "grad_norm_var": 0.04739176432291667, "learning_rate": 0.0001, "loss": 4.8493, "loss/crossentropy": 2.2550876140594482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27425335347652435, "step": 3720 }, { "epoch": 0.07444, "grad_norm": 2.640625, "grad_norm_var": 0.047379557291666666, "learning_rate": 0.0001, "loss": 4.9072, "loss/crossentropy": 2.293414354324341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26979324221611023, "step": 3722 }, { "epoch": 0.07448, "grad_norm": 2.59375, "grad_norm_var": 0.043745930989583334, "learning_rate": 0.0001, "loss": 4.4962, "loss/crossentropy": 2.014510452747345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25709769129753113, "step": 3724 }, { "epoch": 0.07452, "grad_norm": 2.40625, "grad_norm_var": 0.015746053059895834, "learning_rate": 0.0001, "loss": 4.6485, "loss/crossentropy": 2.0332603454589844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585446834564209, "step": 3726 }, { "epoch": 0.07456, "grad_norm": 2.296875, "grad_norm_var": 0.0165191650390625, "learning_rate": 0.0001, "loss": 4.7268, "loss/crossentropy": 1.9425334930419922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22921039909124374, "step": 3728 }, { "epoch": 0.0746, "grad_norm": 2.71875, "grad_norm_var": 0.019090779622395835, "learning_rate": 0.0001, "loss": 4.9306, "loss/crossentropy": 2.1233898997306824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26014500856399536, "step": 3730 }, { "epoch": 0.07464, "grad_norm": 2.609375, "grad_norm_var": 0.023566691080729167, "learning_rate": 0.0001, "loss": 4.9958, "loss/crossentropy": 2.3929240703582764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2791339308023453, "step": 3732 }, { "epoch": 0.07468, "grad_norm": 2.546875, "grad_norm_var": 0.021370442708333333, "learning_rate": 0.0001, "loss": 4.6072, "loss/crossentropy": 2.163137674331665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26793332397937775, "step": 3734 }, { "epoch": 0.07472, "grad_norm": 2.453125, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.723, "loss/crossentropy": 2.1300129294395447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2533607929944992, "step": 3736 }, { "epoch": 0.07476, "grad_norm": 2.5, "grad_norm_var": 0.028055826822916668, "learning_rate": 0.0001, "loss": 4.7232, "loss/crossentropy": 1.9808942675590515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25632843375205994, "step": 3738 }, { "epoch": 0.0748, "grad_norm": 2.453125, "grad_norm_var": 0.03328348795572917, "learning_rate": 0.0001, "loss": 4.8219, "loss/crossentropy": 2.161437451839447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2529585212469101, "step": 3740 }, { "epoch": 0.07484, "grad_norm": 2.53125, "grad_norm_var": 0.03369140625, "learning_rate": 0.0001, "loss": 4.6541, "loss/crossentropy": 1.852737545967102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333993762731552, "step": 3742 }, { "epoch": 0.07488, "grad_norm": 2.265625, "grad_norm_var": 0.03439127604166667, "learning_rate": 0.0001, "loss": 4.4355, "loss/crossentropy": 1.664733350276947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20309723168611526, "step": 3744 }, { "epoch": 0.07492, "grad_norm": 2.203125, "grad_norm_var": 0.03276265462239583, "learning_rate": 0.0001, "loss": 4.4554, "loss/crossentropy": 2.1815799474716187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2470541000366211, "step": 3746 }, { "epoch": 0.07496, "grad_norm": 2.421875, "grad_norm_var": 0.024657185872395834, "learning_rate": 0.0001, "loss": 4.7423, "loss/crossentropy": 1.9546562433242798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2685271203517914, "step": 3748 }, { "epoch": 0.075, "grad_norm": 2.25, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.5171, "loss/crossentropy": 1.920817255973816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2558089941740036, "step": 3750 }, { "epoch": 0.07504, "grad_norm": 2.265625, "grad_norm_var": 0.024982706705729166, "learning_rate": 0.0001, "loss": 4.4795, "loss/crossentropy": 1.7570490837097168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265816479921341, "step": 3752 }, { "epoch": 0.07508, "grad_norm": 2.890625, "grad_norm_var": 0.203564453125, "learning_rate": 0.0001, "loss": 4.756, "loss/crossentropy": 2.1815105676651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536233216524124, "step": 3754 }, { "epoch": 0.07512, "grad_norm": 2.4375, "grad_norm_var": 0.19650777180989584, "learning_rate": 0.0001, "loss": 4.7178, "loss/crossentropy": 2.1385504603385925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2445325404405594, "step": 3756 }, { "epoch": 0.07516, "grad_norm": 2.34375, "grad_norm_var": 0.19650777180989584, "learning_rate": 0.0001, "loss": 4.6449, "loss/crossentropy": 1.7325092554092407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419673353433609, "step": 3758 }, { "epoch": 0.0752, "grad_norm": 2.21875, "grad_norm_var": 0.19572652180989583, "learning_rate": 0.0001, "loss": 4.6096, "loss/crossentropy": 2.358627676963806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713842839002609, "step": 3760 }, { "epoch": 0.07524, "grad_norm": 3.078125, "grad_norm_var": 0.20392252604166666, "learning_rate": 0.0001, "loss": 5.1712, "loss/crossentropy": 2.048672080039978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3007172644138336, "step": 3762 }, { "epoch": 0.07528, "grad_norm": 2.609375, "grad_norm_var": 0.19885660807291666, "learning_rate": 0.0001, "loss": 4.8473, "loss/crossentropy": 2.2967183589935303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28548331558704376, "step": 3764 }, { "epoch": 0.07532, "grad_norm": 2.3125, "grad_norm_var": 0.19228108723958334, "learning_rate": 0.0001, "loss": 4.7541, "loss/crossentropy": 2.1280438899993896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2825516611337662, "step": 3766 }, { "epoch": 0.07536, "grad_norm": 2.375, "grad_norm_var": 0.19146728515625, "learning_rate": 0.0001, "loss": 4.8404, "loss/crossentropy": 2.5528002977371216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28681764006614685, "step": 3768 }, { "epoch": 0.0754, "grad_norm": 2.546875, "grad_norm_var": 0.0549468994140625, "learning_rate": 0.0001, "loss": 4.729, "loss/crossentropy": 2.235885262489319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259520560503006, "step": 3770 }, { "epoch": 0.07544, "grad_norm": 2.421875, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 4.4705, "loss/crossentropy": 1.8836966753005981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2360890954732895, "step": 3772 }, { "epoch": 0.07548, "grad_norm": 2.6875, "grad_norm_var": 0.056538899739583336, "learning_rate": 0.0001, "loss": 4.9291, "loss/crossentropy": 2.3396376371383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30392636358737946, "step": 3774 }, { "epoch": 0.07552, "grad_norm": 2.359375, "grad_norm_var": 0.051301066080729166, "learning_rate": 0.0001, "loss": 4.7518, "loss/crossentropy": 2.4024877548217773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2858506590127945, "step": 3776 }, { "epoch": 0.07556, "grad_norm": 2.25, "grad_norm_var": 0.0353515625, "learning_rate": 0.0001, "loss": 4.4073, "loss/crossentropy": 2.138229727745056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25458595901727676, "step": 3778 }, { "epoch": 0.0756, "grad_norm": 2.328125, "grad_norm_var": 0.0359375, "learning_rate": 0.0001, "loss": 4.3882, "loss/crossentropy": 1.8413254618644714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23022352159023285, "step": 3780 }, { "epoch": 0.07564, "grad_norm": 2.390625, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 4.613, "loss/crossentropy": 1.8554572463035583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25644390285015106, "step": 3782 }, { "epoch": 0.07568, "grad_norm": 2.34375, "grad_norm_var": 0.018452962239583332, "learning_rate": 0.0001, "loss": 4.7013, "loss/crossentropy": 2.0096731781959534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25370142608880997, "step": 3784 }, { "epoch": 0.07572, "grad_norm": 2.65625, "grad_norm_var": 0.020018513997395834, "learning_rate": 0.0001, "loss": 4.9317, "loss/crossentropy": 1.7932413220405579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2426912784576416, "step": 3786 }, { "epoch": 0.07576, "grad_norm": 2.125, "grad_norm_var": 0.0263336181640625, "learning_rate": 0.0001, "loss": 4.1599, "loss/crossentropy": 2.0372042655944824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24203844368457794, "step": 3788 }, { "epoch": 0.0758, "grad_norm": 2.296875, "grad_norm_var": 0.021773274739583334, "learning_rate": 0.0001, "loss": 4.3627, "loss/crossentropy": 1.8986076712608337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24614746868610382, "step": 3790 }, { "epoch": 0.07584, "grad_norm": 2.234375, "grad_norm_var": 0.022135416666666668, "learning_rate": 0.0001, "loss": 4.563, "loss/crossentropy": 1.8080393075942993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23206676542758942, "step": 3792 }, { "epoch": 0.07588, "grad_norm": 2.25, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 4.6857, "loss/crossentropy": 1.7578041553497314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21686340868473053, "step": 3794 }, { "epoch": 0.07592, "grad_norm": 2.40625, "grad_norm_var": 0.015087890625, "learning_rate": 0.0001, "loss": 4.4938, "loss/crossentropy": 2.0115376710891724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2651440501213074, "step": 3796 }, { "epoch": 0.07596, "grad_norm": 2.40625, "grad_norm_var": 0.015803019205729168, "learning_rate": 0.0001, "loss": 4.598, "loss/crossentropy": 2.028555393218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26161982119083405, "step": 3798 }, { "epoch": 0.076, "grad_norm": 2.515625, "grad_norm_var": 0.016429646809895834, "learning_rate": 0.0001, "loss": 4.8315, "loss/crossentropy": 2.158663272857666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28110067546367645, "step": 3800 }, { "epoch": 0.07604, "grad_norm": 2.734375, "grad_norm_var": 0.019652303059895834, "learning_rate": 0.0001, "loss": 5.2376, "loss/crossentropy": 2.2959556579589844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28477419912815094, "step": 3802 }, { "epoch": 0.07608, "grad_norm": 2.296875, "grad_norm_var": 0.014997355143229167, "learning_rate": 0.0001, "loss": 4.5289, "loss/crossentropy": 2.149766206741333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25216003507375717, "step": 3804 }, { "epoch": 0.07612, "grad_norm": 2.390625, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 4.7496, "loss/crossentropy": 1.9866302609443665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26193149387836456, "step": 3806 }, { "epoch": 0.07616, "grad_norm": 2.46875, "grad_norm_var": 0.0122955322265625, "learning_rate": 0.0001, "loss": 4.9968, "loss/crossentropy": 2.4230403900146484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27897247672080994, "step": 3808 }, { "epoch": 0.0762, "grad_norm": 2.65625, "grad_norm_var": 0.012613932291666666, "learning_rate": 0.0001, "loss": 4.9133, "loss/crossentropy": 2.2995522022247314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27623192965984344, "step": 3810 }, { "epoch": 0.07624, "grad_norm": 2.484375, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.6632, "loss/crossentropy": 2.167468547821045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26121728122234344, "step": 3812 }, { "epoch": 0.07628, "grad_norm": 2.5, "grad_norm_var": 0.013353474934895833, "learning_rate": 0.0001, "loss": 4.8435, "loss/crossentropy": 2.3259944915771484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29026439785957336, "step": 3814 }, { "epoch": 0.07632, "grad_norm": 2.46875, "grad_norm_var": 0.041825358072916666, "learning_rate": 0.0001, "loss": 4.8704, "loss/crossentropy": 2.18080472946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550061345100403, "step": 3816 }, { "epoch": 0.07636, "grad_norm": 2.390625, "grad_norm_var": 0.03871968587239583, "learning_rate": 0.0001, "loss": 4.5681, "loss/crossentropy": 2.1185330748558044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25014883279800415, "step": 3818 }, { "epoch": 0.0764, "grad_norm": 2.28125, "grad_norm_var": 0.039383951822916666, "learning_rate": 0.0001, "loss": 4.5776, "loss/crossentropy": 1.9028193354606628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22502654790878296, "step": 3820 }, { "epoch": 0.07644, "grad_norm": 2.640625, "grad_norm_var": 0.039713541666666664, "learning_rate": 0.0001, "loss": 5.0188, "loss/crossentropy": 2.266402840614319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24728389829397202, "step": 3822 }, { "epoch": 0.07648, "grad_norm": 2.546875, "grad_norm_var": 0.04279683430989583, "learning_rate": 0.0001, "loss": 4.7036, "loss/crossentropy": 2.0918440222740173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2669295519590378, "step": 3824 }, { "epoch": 0.07652, "grad_norm": 2.296875, "grad_norm_var": 0.04456380208333333, "learning_rate": 0.0001, "loss": 4.1446, "loss/crossentropy": 1.6120481491088867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21910656243562698, "step": 3826 }, { "epoch": 0.07656, "grad_norm": 2.609375, "grad_norm_var": 0.044331868489583336, "learning_rate": 0.0001, "loss": 4.6352, "loss/crossentropy": 2.2294809818267822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2720891535282135, "step": 3828 }, { "epoch": 0.0766, "grad_norm": 2.765625, "grad_norm_var": 0.0524322509765625, "learning_rate": 0.0001, "loss": 4.873, "loss/crossentropy": 2.2588730454444885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613115608692169, "step": 3830 }, { "epoch": 0.07664, "grad_norm": 2.28125, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 4.5881, "loss/crossentropy": 2.2658292055130005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2729395925998688, "step": 3832 }, { "epoch": 0.07668, "grad_norm": 2.484375, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 5.1787, "loss/crossentropy": 2.314574718475342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28539060056209564, "step": 3834 }, { "epoch": 0.07672, "grad_norm": 2.546875, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 5.0388, "loss/crossentropy": 2.4684417247772217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29843954741954803, "step": 3836 }, { "epoch": 0.07676, "grad_norm": 2.546875, "grad_norm_var": 0.025944010416666666, "learning_rate": 0.0001, "loss": 4.5976, "loss/crossentropy": 2.528733253479004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2696636766195297, "step": 3838 }, { "epoch": 0.0768, "grad_norm": 2.390625, "grad_norm_var": 0.025340779622395834, "learning_rate": 0.0001, "loss": 4.6449, "loss/crossentropy": 2.203901529312134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26598773896694183, "step": 3840 }, { "epoch": 0.07684, "grad_norm": 2.5625, "grad_norm_var": 0.021907552083333334, "learning_rate": 0.0001, "loss": 4.7322, "loss/crossentropy": 1.9192892909049988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25631098449230194, "step": 3842 }, { "epoch": 0.07688, "grad_norm": 2.3125, "grad_norm_var": 0.022077433268229165, "learning_rate": 0.0001, "loss": 4.6372, "loss/crossentropy": 1.8314838409423828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22842589765787125, "step": 3844 }, { "epoch": 0.07692, "grad_norm": 2.3125, "grad_norm_var": 0.013395182291666667, "learning_rate": 0.0001, "loss": 4.6023, "loss/crossentropy": 2.2416744232177734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24890189617872238, "step": 3846 }, { "epoch": 0.07696, "grad_norm": 2.46875, "grad_norm_var": 0.012580362955729167, "learning_rate": 0.0001, "loss": 4.8454, "loss/crossentropy": 2.034749209880829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29580502212047577, "step": 3848 }, { "epoch": 0.077, "grad_norm": 2.4375, "grad_norm_var": 0.0117095947265625, "learning_rate": 0.0001, "loss": 4.5923, "loss/crossentropy": 1.9982805848121643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2618003487586975, "step": 3850 }, { "epoch": 0.07704, "grad_norm": 2.59375, "grad_norm_var": 0.0127105712890625, "learning_rate": 0.0001, "loss": 4.7704, "loss/crossentropy": 2.065816104412079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2571987137198448, "step": 3852 }, { "epoch": 0.07708, "grad_norm": 2.46875, "grad_norm_var": 0.0100982666015625, "learning_rate": 0.0001, "loss": 4.7493, "loss/crossentropy": 1.933334231376648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24824900180101395, "step": 3854 }, { "epoch": 0.07712, "grad_norm": 2.28125, "grad_norm_var": 0.010514322916666667, "learning_rate": 0.0001, "loss": 4.5805, "loss/crossentropy": 1.9197405576705933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2559673935174942, "step": 3856 }, { "epoch": 0.07716, "grad_norm": 2.3125, "grad_norm_var": 0.010252888997395833, "learning_rate": 0.0001, "loss": 4.429, "loss/crossentropy": 2.307250142097473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613677680492401, "step": 3858 }, { "epoch": 0.0772, "grad_norm": 2.28125, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.4494, "loss/crossentropy": 2.1120635271072388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24830741435289383, "step": 3860 }, { "epoch": 0.07724, "grad_norm": 2.484375, "grad_norm_var": 0.039013671875, "learning_rate": 0.0001, "loss": 4.8585, "loss/crossentropy": 2.404169201850891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2960711419582367, "step": 3862 }, { "epoch": 0.07728, "grad_norm": 2.421875, "grad_norm_var": 0.038899739583333336, "learning_rate": 0.0001, "loss": 5.0257, "loss/crossentropy": 2.2490307688713074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2569812461733818, "step": 3864 }, { "epoch": 0.07732, "grad_norm": 2.34375, "grad_norm_var": 0.0386871337890625, "learning_rate": 0.0001, "loss": 4.9086, "loss/crossentropy": 2.0773178339004517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505042105913162, "step": 3866 }, { "epoch": 0.07736, "grad_norm": 2.40625, "grad_norm_var": 0.038899739583333336, "learning_rate": 0.0001, "loss": 4.4621, "loss/crossentropy": 1.83626389503479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22876176983118057, "step": 3868 }, { "epoch": 0.0774, "grad_norm": 2.34375, "grad_norm_var": 0.03911031087239583, "learning_rate": 0.0001, "loss": 4.5298, "loss/crossentropy": 1.8159971833229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22787011414766312, "step": 3870 }, { "epoch": 0.07744, "grad_norm": 2.453125, "grad_norm_var": 0.0378326416015625, "learning_rate": 0.0001, "loss": 4.5995, "loss/crossentropy": 2.0361026525497437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23802263289690018, "step": 3872 }, { "epoch": 0.07748, "grad_norm": 2.703125, "grad_norm_var": 0.04433492024739583, "learning_rate": 0.0001, "loss": 4.9506, "loss/crossentropy": 2.2464375495910645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26598919928073883, "step": 3874 }, { "epoch": 0.07752, "grad_norm": 2.453125, "grad_norm_var": 0.0408355712890625, "learning_rate": 0.0001, "loss": 4.9416, "loss/crossentropy": 2.163287401199341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29068198800086975, "step": 3876 }, { "epoch": 0.07756, "grad_norm": 2.5, "grad_norm_var": 0.22014567057291667, "learning_rate": 0.0001, "loss": 4.9478, "loss/crossentropy": 2.1638875007629395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2609352171421051, "step": 3878 }, { "epoch": 0.0776, "grad_norm": 2.703125, "grad_norm_var": 0.21923421223958334, "learning_rate": 0.0001, "loss": 4.6247, "loss/crossentropy": 1.9216270446777344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25062238425016403, "step": 3880 }, { "epoch": 0.07764, "grad_norm": 2.828125, "grad_norm_var": 0.21585184733072918, "learning_rate": 0.0001, "loss": 5.0786, "loss/crossentropy": 2.036958694458008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2897229939699173, "step": 3882 }, { "epoch": 0.07768, "grad_norm": 2.5625, "grad_norm_var": 0.2094635009765625, "learning_rate": 0.0001, "loss": 4.6062, "loss/crossentropy": 2.1493492126464844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.258526012301445, "step": 3884 }, { "epoch": 0.07772, "grad_norm": 2.484375, "grad_norm_var": 0.20414937337239583, "learning_rate": 0.0001, "loss": 4.468, "loss/crossentropy": 2.0496288537979126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2386137992143631, "step": 3886 }, { "epoch": 0.07776, "grad_norm": 2.40625, "grad_norm_var": 0.20829671223958332, "learning_rate": 0.0001, "loss": 4.2402, "loss/crossentropy": 1.5763422846794128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2227308303117752, "step": 3888 }, { "epoch": 0.0778, "grad_norm": 2.640625, "grad_norm_var": 0.20852864583333333, "learning_rate": 0.0001, "loss": 5.0582, "loss/crossentropy": 2.5032416582107544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3047761619091034, "step": 3890 }, { "epoch": 0.07784, "grad_norm": 2.375, "grad_norm_var": 0.21199544270833334, "learning_rate": 0.0001, "loss": 4.7694, "loss/crossentropy": 2.3609601259231567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27848154306411743, "step": 3892 }, { "epoch": 0.07788, "grad_norm": 2.328125, "grad_norm_var": 0.0202056884765625, "learning_rate": 0.0001, "loss": 4.776, "loss/crossentropy": 2.188078999519348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2570301741361618, "step": 3894 }, { "epoch": 0.07792, "grad_norm": 2.546875, "grad_norm_var": 0.018733723958333334, "learning_rate": 0.0001, "loss": 4.8374, "loss/crossentropy": 1.9860637784004211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25376833230257034, "step": 3896 }, { "epoch": 0.07796, "grad_norm": 2.234375, "grad_norm_var": 0.010399373372395833, "learning_rate": 0.0001, "loss": 4.404, "loss/crossentropy": 2.0886037945747375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24766233563423157, "step": 3898 }, { "epoch": 0.078, "grad_norm": 2.3125, "grad_norm_var": 0.009845987955729166, "learning_rate": 0.0001, "loss": 4.6833, "loss/crossentropy": 2.373010039329529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27916407585144043, "step": 3900 }, { "epoch": 0.07804, "grad_norm": 2.359375, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 4.6295, "loss/crossentropy": 1.6733890771865845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211885154247284, "step": 3902 }, { "epoch": 0.07808, "grad_norm": 2.46875, "grad_norm_var": 0.010184733072916667, "learning_rate": 0.0001, "loss": 4.6588, "loss/crossentropy": 2.0506675243377686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27091431617736816, "step": 3904 }, { "epoch": 0.07812, "grad_norm": 2.28125, "grad_norm_var": 0.008784993489583334, "learning_rate": 0.0001, "loss": 4.712, "loss/crossentropy": 2.3200724124908447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26983049511909485, "step": 3906 }, { "epoch": 0.07816, "grad_norm": 2.765625, "grad_norm_var": 0.0164459228515625, "learning_rate": 0.0001, "loss": 4.7171, "loss/crossentropy": 1.928814709186554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2423655241727829, "step": 3908 }, { "epoch": 0.0782, "grad_norm": 2.40625, "grad_norm_var": 0.016389973958333335, "learning_rate": 0.0001, "loss": 4.6944, "loss/crossentropy": 2.007555842399597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2444767728447914, "step": 3910 }, { "epoch": 0.07824, "grad_norm": 2.875, "grad_norm_var": 0.027469889322916666, "learning_rate": 0.0001, "loss": 4.7955, "loss/crossentropy": 2.2054057121276855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2849784791469574, "step": 3912 }, { "epoch": 0.07828, "grad_norm": 2.53125, "grad_norm_var": 0.03986002604166667, "learning_rate": 0.0001, "loss": 4.6885, "loss/crossentropy": 2.331532597541809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2820790112018585, "step": 3914 }, { "epoch": 0.07832, "grad_norm": 2.40625, "grad_norm_var": 0.036519368489583336, "learning_rate": 0.0001, "loss": 4.6967, "loss/crossentropy": 2.142041563987732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25799722969532013, "step": 3916 }, { "epoch": 0.07836, "grad_norm": 2.625, "grad_norm_var": 0.0394683837890625, "learning_rate": 0.0001, "loss": 4.5415, "loss/crossentropy": 2.010735809803009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24202881753444672, "step": 3918 }, { "epoch": 0.0784, "grad_norm": 2.4375, "grad_norm_var": 0.03970947265625, "learning_rate": 0.0001, "loss": 5.037, "loss/crossentropy": 2.382808804512024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27038049697875977, "step": 3920 }, { "epoch": 0.07844, "grad_norm": 2.375, "grad_norm_var": 0.0372955322265625, "learning_rate": 0.0001, "loss": 4.7708, "loss/crossentropy": 2.099658191204071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557126358151436, "step": 3922 }, { "epoch": 0.07848, "grad_norm": 2.375, "grad_norm_var": 0.03968098958333333, "learning_rate": 0.0001, "loss": 4.3775, "loss/crossentropy": 1.7840275764465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22102414071559906, "step": 3924 }, { "epoch": 0.07852, "grad_norm": 2.5, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 4.6637, "loss/crossentropy": 1.8730725049972534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23219943791627884, "step": 3926 }, { "epoch": 0.07856, "grad_norm": 2.203125, "grad_norm_var": 0.034520467122395836, "learning_rate": 0.0001, "loss": 4.4432, "loss/crossentropy": 1.9218623638153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23333143442869186, "step": 3928 }, { "epoch": 0.0786, "grad_norm": 2.46875, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 4.8944, "loss/crossentropy": 1.9885727763175964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2596626430749893, "step": 3930 }, { "epoch": 0.07864, "grad_norm": 2.390625, "grad_norm_var": 0.014989217122395834, "learning_rate": 0.0001, "loss": 4.9358, "loss/crossentropy": 2.397018015384674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2790713906288147, "step": 3932 }, { "epoch": 0.07868, "grad_norm": 2.1875, "grad_norm_var": 0.014383951822916666, "learning_rate": 0.0001, "loss": 4.3062, "loss/crossentropy": 1.7345170378684998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23668452352285385, "step": 3934 }, { "epoch": 0.07872, "grad_norm": 2.25, "grad_norm_var": 0.013309733072916666, "learning_rate": 0.0001, "loss": 4.6173, "loss/crossentropy": 1.8630162477493286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2371346428990364, "step": 3936 }, { "epoch": 0.07876, "grad_norm": 2.40625, "grad_norm_var": 0.010367838541666667, "learning_rate": 0.0001, "loss": 4.5774, "loss/crossentropy": 2.0738128423690796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24340355396270752, "step": 3938 }, { "epoch": 0.0788, "grad_norm": 2.40625, "grad_norm_var": 0.010835774739583333, "learning_rate": 0.0001, "loss": 5.0027, "loss/crossentropy": 2.2932467460632324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527567446231842, "step": 3940 }, { "epoch": 0.07884, "grad_norm": 2.46875, "grad_norm_var": 0.009956868489583333, "learning_rate": 0.0001, "loss": 4.6785, "loss/crossentropy": 2.0000113248825073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23461253196001053, "step": 3942 }, { "epoch": 0.07888, "grad_norm": 2.296875, "grad_norm_var": 0.008430989583333333, "learning_rate": 0.0001, "loss": 4.7307, "loss/crossentropy": 2.0753955841064453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2535083442926407, "step": 3944 }, { "epoch": 0.07892, "grad_norm": 2.40625, "grad_norm_var": 0.008153279622395834, "learning_rate": 0.0001, "loss": 4.6906, "loss/crossentropy": 2.167261242866516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24734552949666977, "step": 3946 }, { "epoch": 0.07896, "grad_norm": 2.578125, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 4.7347, "loss/crossentropy": 1.9755831956863403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24861737340688705, "step": 3948 }, { "epoch": 0.079, "grad_norm": 2.484375, "grad_norm_var": 0.009261067708333333, "learning_rate": 0.0001, "loss": 4.6813, "loss/crossentropy": 2.195417881011963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2518697530031204, "step": 3950 }, { "epoch": 0.07904, "grad_norm": 2.828125, "grad_norm_var": 0.01920166015625, "learning_rate": 0.0001, "loss": 5.0172, "loss/crossentropy": 2.5771371126174927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28423887491226196, "step": 3952 }, { "epoch": 0.07908, "grad_norm": 2.375, "grad_norm_var": 0.017870076497395835, "learning_rate": 0.0001, "loss": 4.7071, "loss/crossentropy": 1.683276355266571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075405865907669, "step": 3954 }, { "epoch": 0.07912, "grad_norm": 2.515625, "grad_norm_var": 0.018993123372395834, "learning_rate": 0.0001, "loss": 4.7128, "loss/crossentropy": 2.2983756065368652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28010235726833344, "step": 3956 }, { "epoch": 0.07916, "grad_norm": 2.421875, "grad_norm_var": 0.020361328125, "learning_rate": 0.0001, "loss": 4.7896, "loss/crossentropy": 2.3263272047042847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28625819087028503, "step": 3958 }, { "epoch": 0.0792, "grad_norm": 2.40625, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 4.5201, "loss/crossentropy": 1.9820871353149414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2594129145145416, "step": 3960 }, { "epoch": 0.07924, "grad_norm": 2.171875, "grad_norm_var": 0.025634765625, "learning_rate": 0.0001, "loss": 4.4754, "loss/crossentropy": 1.8991515636444092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23694587498903275, "step": 3962 }, { "epoch": 0.07928, "grad_norm": 2.921875, "grad_norm_var": 0.03843994140625, "learning_rate": 0.0001, "loss": 4.9865, "loss/crossentropy": 2.485508918762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2957809865474701, "step": 3964 }, { "epoch": 0.07932, "grad_norm": 2.390625, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 4.8871, "loss/crossentropy": 2.156081974506378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27474651485681534, "step": 3966 }, { "epoch": 0.07936, "grad_norm": 2.5625, "grad_norm_var": 0.028544108072916668, "learning_rate": 0.0001, "loss": 4.8694, "loss/crossentropy": 2.0370571613311768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2732074484229088, "step": 3968 }, { "epoch": 0.0794, "grad_norm": 2.421875, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 5.062, "loss/crossentropy": 2.3039989471435547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31054411828517914, "step": 3970 }, { "epoch": 0.07944, "grad_norm": 2.34375, "grad_norm_var": 0.028641764322916666, "learning_rate": 0.0001, "loss": 4.7875, "loss/crossentropy": 2.2280107736587524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2711277902126312, "step": 3972 }, { "epoch": 0.07948, "grad_norm": 2.625, "grad_norm_var": 0.02935791015625, "learning_rate": 0.0001, "loss": 4.8992, "loss/crossentropy": 2.0609869956970215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24142058193683624, "step": 3974 }, { "epoch": 0.07952, "grad_norm": 2.296875, "grad_norm_var": 0.03277587890625, "learning_rate": 0.0001, "loss": 4.1391, "loss/crossentropy": 2.0541876554489136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24537865817546844, "step": 3976 }, { "epoch": 0.07956, "grad_norm": 2.578125, "grad_norm_var": 0.6102701822916666, "learning_rate": 0.0001, "loss": 4.9591, "loss/crossentropy": 2.344236969947815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2770863175392151, "step": 3978 }, { "epoch": 0.0796, "grad_norm": 2.625, "grad_norm_var": 0.65465087890625, "learning_rate": 0.0001, "loss": 4.7232, "loss/crossentropy": 1.7899338603019714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23951984196901321, "step": 3980 }, { "epoch": 0.07964, "grad_norm": 2.328125, "grad_norm_var": 0.6591471354166667, "learning_rate": 0.0001, "loss": 4.6771, "loss/crossentropy": 2.3253976106643677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2691802680492401, "step": 3982 }, { "epoch": 0.07968, "grad_norm": 2.453125, "grad_norm_var": 0.65885009765625, "learning_rate": 0.0001, "loss": 4.8223, "loss/crossentropy": 2.14141583442688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2603989467024803, "step": 3984 }, { "epoch": 0.07972, "grad_norm": 2.53125, "grad_norm_var": 0.6512196858723959, "learning_rate": 0.0001, "loss": 4.9059, "loss/crossentropy": 2.262465476989746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.291474387049675, "step": 3986 }, { "epoch": 0.07976, "grad_norm": 2.328125, "grad_norm_var": 0.66064453125, "learning_rate": 0.0001, "loss": 4.5626, "loss/crossentropy": 2.1835561990737915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23615659773349762, "step": 3988 }, { "epoch": 0.0798, "grad_norm": 2.359375, "grad_norm_var": 0.6716145833333333, "learning_rate": 0.0001, "loss": 4.8482, "loss/crossentropy": 2.020140767097473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2593151703476906, "step": 3990 }, { "epoch": 0.07984, "grad_norm": 2.59375, "grad_norm_var": 0.6518513997395833, "learning_rate": 0.0001, "loss": 4.7986, "loss/crossentropy": 2.277661681175232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2721617519855499, "step": 3992 }, { "epoch": 0.07988, "grad_norm": 2.34375, "grad_norm_var": 0.09846089680989584, "learning_rate": 0.0001, "loss": 4.5113, "loss/crossentropy": 2.1883193254470825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2610222101211548, "step": 3994 }, { "epoch": 0.07992, "grad_norm": 2.421875, "grad_norm_var": 0.01754150390625, "learning_rate": 0.0001, "loss": 4.5987, "loss/crossentropy": 2.152850031852722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27418845891952515, "step": 3996 }, { "epoch": 0.07996, "grad_norm": 2.546875, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 4.8214, "loss/crossentropy": 2.3313716650009155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27529503405094147, "step": 3998 }, { "epoch": 0.08, "grad_norm": 2.640625, "grad_norm_var": 0.020319620768229168, "learning_rate": 0.0001, "loss": 5.086, "loss/crossentropy": 2.250498414039612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26368021965026855, "step": 4000 }, { "epoch": 0.08004, "grad_norm": 2.4375, "grad_norm_var": 0.011188761393229166, "learning_rate": 0.0001, "loss": 4.8005, "loss/crossentropy": 2.322459101676941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2868216335773468, "step": 4002 }, { "epoch": 0.08008, "grad_norm": 2.40625, "grad_norm_var": 0.0142730712890625, "learning_rate": 0.0001, "loss": 4.8693, "loss/crossentropy": 1.9340506792068481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316983863711357, "step": 4004 }, { "epoch": 0.08012, "grad_norm": 2.359375, "grad_norm_var": 0.0150054931640625, "learning_rate": 0.0001, "loss": 4.7395, "loss/crossentropy": 1.8635645508766174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22694773972034454, "step": 4006 }, { "epoch": 0.08016, "grad_norm": 10.375, "grad_norm_var": 3.9615631103515625, "learning_rate": 0.0001, "loss": 4.8916, "loss/crossentropy": 1.9252317547798157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24893560260534286, "step": 4008 }, { "epoch": 0.0802, "grad_norm": 2.671875, "grad_norm_var": 3.9093424479166665, "learning_rate": 0.0001, "loss": 5.2636, "loss/crossentropy": 2.1964328289031982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567693591117859, "step": 4010 }, { "epoch": 0.08024, "grad_norm": 2.671875, "grad_norm_var": 3.8960113525390625, "learning_rate": 0.0001, "loss": 4.9054, "loss/crossentropy": 2.296012043952942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27360107749700546, "step": 4012 }, { "epoch": 0.08028, "grad_norm": 2.453125, "grad_norm_var": 3.9009724934895833, "learning_rate": 0.0001, "loss": 4.8894, "loss/crossentropy": 2.360015869140625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27541905641555786, "step": 4014 }, { "epoch": 0.08032, "grad_norm": 2.390625, "grad_norm_var": 3.906591796875, "learning_rate": 0.0001, "loss": 4.8865, "loss/crossentropy": 2.36370050907135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27881547808647156, "step": 4016 }, { "epoch": 0.08036, "grad_norm": 2.25, "grad_norm_var": 3.9068593343098956, "learning_rate": 0.0001, "loss": 4.6461, "loss/crossentropy": 1.8704780340194702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2512069493532181, "step": 4018 }, { "epoch": 0.0804, "grad_norm": 2.4375, "grad_norm_var": 3.9156483968098956, "learning_rate": 0.0001, "loss": 4.733, "loss/crossentropy": 2.1989234685897827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2755106985569, "step": 4020 }, { "epoch": 0.08044, "grad_norm": 2.21875, "grad_norm_var": 3.9420237223307293, "learning_rate": 0.0001, "loss": 4.3471, "loss/crossentropy": 1.9905433058738708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2579897418618202, "step": 4022 }, { "epoch": 0.08048, "grad_norm": 2.515625, "grad_norm_var": 0.08837788899739583, "learning_rate": 0.0001, "loss": 4.9025, "loss/crossentropy": 2.270000696182251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2693728804588318, "step": 4024 }, { "epoch": 0.08052, "grad_norm": 2.375, "grad_norm_var": 0.08504130045572916, "learning_rate": 0.0001, "loss": 4.7569, "loss/crossentropy": 2.178301692008972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2617819905281067, "step": 4026 }, { "epoch": 0.08056, "grad_norm": 2.25, "grad_norm_var": 0.08346354166666667, "learning_rate": 0.0001, "loss": 4.687, "loss/crossentropy": 2.518654465675354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29282887279987335, "step": 4028 }, { "epoch": 0.0806, "grad_norm": 2.25, "grad_norm_var": 0.08859049479166667, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 1.9181422591209412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23210199177265167, "step": 4030 }, { "epoch": 0.08064, "grad_norm": 2.359375, "grad_norm_var": 0.08816630045572917, "learning_rate": 0.0001, "loss": 4.6407, "loss/crossentropy": 2.343222141265869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28908614814281464, "step": 4032 }, { "epoch": 0.08068, "grad_norm": 2.390625, "grad_norm_var": 0.0302886962890625, "learning_rate": 0.0001, "loss": 4.6879, "loss/crossentropy": 2.0816845893859863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2571762502193451, "step": 4034 }, { "epoch": 0.08072, "grad_norm": 2.296875, "grad_norm_var": 0.029878743489583335, "learning_rate": 0.0001, "loss": 4.3324, "loss/crossentropy": 1.9679544568061829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316160574555397, "step": 4036 }, { "epoch": 0.08076, "grad_norm": 2.328125, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 4.7335, "loss/crossentropy": 2.1553682684898376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25725623965263367, "step": 4038 }, { "epoch": 0.0808, "grad_norm": 2.421875, "grad_norm_var": 0.004964192708333333, "learning_rate": 0.0001, "loss": 4.6892, "loss/crossentropy": 2.0269790291786194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26298412680625916, "step": 4040 }, { "epoch": 0.08084, "grad_norm": 2.484375, "grad_norm_var": 0.0060943603515625, "learning_rate": 0.0001, "loss": 4.6686, "loss/crossentropy": 1.984773874282837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24288517236709595, "step": 4042 }, { "epoch": 0.08088, "grad_norm": 2.3125, "grad_norm_var": 0.0051910400390625, "learning_rate": 0.0001, "loss": 4.9282, "loss/crossentropy": 2.178356111049652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2684163451194763, "step": 4044 }, { "epoch": 0.08092, "grad_norm": 2.46875, "grad_norm_var": 0.0053670247395833336, "learning_rate": 0.0001, "loss": 4.8191, "loss/crossentropy": 2.235984683036804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26377667486667633, "step": 4046 }, { "epoch": 0.08096, "grad_norm": 2.34375, "grad_norm_var": 0.005826822916666667, "learning_rate": 0.0001, "loss": 4.7026, "loss/crossentropy": 2.085321545600891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23403701931238174, "step": 4048 }, { "epoch": 0.081, "grad_norm": 2.3125, "grad_norm_var": 0.0052642822265625, "learning_rate": 0.0001, "loss": 4.9932, "loss/crossentropy": 2.419228672981262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27236658334732056, "step": 4050 }, { "epoch": 0.08104, "grad_norm": 2.421875, "grad_norm_var": 0.0054972330729166664, "learning_rate": 0.0001, "loss": 4.6105, "loss/crossentropy": 2.153620958328247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28071053326129913, "step": 4052 }, { "epoch": 0.08108, "grad_norm": 2.71875, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 4.8775, "loss/crossentropy": 2.1466477513313293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27491800487041473, "step": 4054 }, { "epoch": 0.08112, "grad_norm": 2.265625, "grad_norm_var": 0.013932291666666667, "learning_rate": 0.0001, "loss": 4.3707, "loss/crossentropy": 2.2020710706710815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25149518996477127, "step": 4056 }, { "epoch": 0.08116, "grad_norm": 2.34375, "grad_norm_var": 0.013623046875, "learning_rate": 0.0001, "loss": 4.8458, "loss/crossentropy": 2.264205574989319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27612583339214325, "step": 4058 }, { "epoch": 0.0812, "grad_norm": 2.4375, "grad_norm_var": 0.016434733072916666, "learning_rate": 0.0001, "loss": 4.8644, "loss/crossentropy": 2.269905209541321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692303955554962, "step": 4060 }, { "epoch": 0.08124, "grad_norm": 2.34375, "grad_norm_var": 0.018505859375, "learning_rate": 0.0001, "loss": 4.5057, "loss/crossentropy": 1.920631766319275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22550494968891144, "step": 4062 }, { "epoch": 0.08128, "grad_norm": 2.53125, "grad_norm_var": 0.019749959309895832, "learning_rate": 0.0001, "loss": 5.0796, "loss/crossentropy": 2.307617664337158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22719035297632217, "step": 4064 }, { "epoch": 0.08132, "grad_norm": 2.375, "grad_norm_var": 0.022102864583333333, "learning_rate": 0.0001, "loss": 4.6167, "loss/crossentropy": 2.113444685935974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.240354023873806, "step": 4066 }, { "epoch": 0.08136, "grad_norm": 2.375, "grad_norm_var": 0.02232666015625, "learning_rate": 0.0001, "loss": 4.9152, "loss/crossentropy": 2.4516230821609497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27590544521808624, "step": 4068 }, { "epoch": 0.0814, "grad_norm": 2.34375, "grad_norm_var": 0.08772379557291667, "learning_rate": 0.0001, "loss": 4.5976, "loss/crossentropy": 1.8287339806556702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22503511607646942, "step": 4070 }, { "epoch": 0.08144, "grad_norm": 2.5, "grad_norm_var": 0.08479715983072916, "learning_rate": 0.0001, "loss": 5.1623, "loss/crossentropy": 2.3468997478485107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2773839682340622, "step": 4072 }, { "epoch": 0.08148, "grad_norm": 2.296875, "grad_norm_var": 0.08782145182291666, "learning_rate": 0.0001, "loss": 4.5413, "loss/crossentropy": 2.1307512521743774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24247504770755768, "step": 4074 }, { "epoch": 0.08152, "grad_norm": 2.21875, "grad_norm_var": 0.08982645670572917, "learning_rate": 0.0001, "loss": 4.7447, "loss/crossentropy": 2.248755097389221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2696859538555145, "step": 4076 }, { "epoch": 0.08156, "grad_norm": 2.265625, "grad_norm_var": 0.09045817057291666, "learning_rate": 0.0001, "loss": 4.2948, "loss/crossentropy": 2.0233980417251587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370016872882843, "step": 4078 }, { "epoch": 0.0816, "grad_norm": 2.234375, "grad_norm_var": 0.0923828125, "learning_rate": 0.0001, "loss": 4.432, "loss/crossentropy": 1.9536627531051636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23242933303117752, "step": 4080 }, { "epoch": 0.08164, "grad_norm": 2.375, "grad_norm_var": 0.0889801025390625, "learning_rate": 0.0001, "loss": 4.5037, "loss/crossentropy": 1.9631904363632202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2337196245789528, "step": 4082 }, { "epoch": 0.08168, "grad_norm": 2.546875, "grad_norm_var": 0.08935139973958334, "learning_rate": 0.0001, "loss": 4.7406, "loss/crossentropy": 2.193789482116699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687358558177948, "step": 4084 }, { "epoch": 0.08172, "grad_norm": 2.40625, "grad_norm_var": 0.020210774739583333, "learning_rate": 0.0001, "loss": 4.6454, "loss/crossentropy": 2.308240056037903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2679348289966583, "step": 4086 }, { "epoch": 0.08176, "grad_norm": 2.359375, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 4.9126, "loss/crossentropy": 2.343047261238098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3056950569152832, "step": 4088 }, { "epoch": 0.0818, "grad_norm": 2.234375, "grad_norm_var": 0.022526041666666666, "learning_rate": 0.0001, "loss": 4.7315, "loss/crossentropy": 1.9583085179328918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23911744356155396, "step": 4090 }, { "epoch": 0.08184, "grad_norm": 2.296875, "grad_norm_var": 0.021198527018229166, "learning_rate": 0.0001, "loss": 4.6559, "loss/crossentropy": 2.341569185256958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26053962111473083, "step": 4092 }, { "epoch": 0.08188, "grad_norm": 2.921875, "grad_norm_var": 0.034601847330729164, "learning_rate": 0.0001, "loss": 4.5773, "loss/crossentropy": 2.068669080734253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25753986835479736, "step": 4094 }, { "epoch": 0.08192, "grad_norm": 2.625, "grad_norm_var": 0.04309488932291667, "learning_rate": 0.0001, "loss": 5.0253, "loss/crossentropy": 2.1461241841316223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628704681992531, "step": 4096 }, { "epoch": 0.08196, "grad_norm": 2.3125, "grad_norm_var": 0.04810791015625, "learning_rate": 0.0001, "loss": 4.5587, "loss/crossentropy": 2.0718055963516235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24923217296600342, "step": 4098 }, { "epoch": 0.082, "grad_norm": 2.359375, "grad_norm_var": 0.0503082275390625, "learning_rate": 0.0001, "loss": 4.379, "loss/crossentropy": 1.9812004566192627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316955253481865, "step": 4100 }, { "epoch": 0.08204, "grad_norm": 2.3125, "grad_norm_var": 0.0465240478515625, "learning_rate": 0.0001, "loss": 4.7909, "loss/crossentropy": 2.2669100761413574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2806926518678665, "step": 4102 }, { "epoch": 0.08208, "grad_norm": 2.375, "grad_norm_var": 0.043196614583333334, "learning_rate": 0.0001, "loss": 4.7502, "loss/crossentropy": 2.0620261430740356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2610047310590744, "step": 4104 }, { "epoch": 0.08212, "grad_norm": 3.046875, "grad_norm_var": 0.06297098795572917, "learning_rate": 0.0001, "loss": 4.6672, "loss/crossentropy": 2.249971866607666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2783341556787491, "step": 4106 }, { "epoch": 0.08216, "grad_norm": 2.328125, "grad_norm_var": 0.06320699055989583, "learning_rate": 0.0001, "loss": 4.834, "loss/crossentropy": 2.1064823865890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27534550428390503, "step": 4108 }, { "epoch": 0.0822, "grad_norm": 2.28125, "grad_norm_var": 0.05406901041666667, "learning_rate": 0.0001, "loss": 4.3944, "loss/crossentropy": 1.886509656906128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2315160632133484, "step": 4110 }, { "epoch": 0.08224, "grad_norm": 2.40625, "grad_norm_var": 0.039290364583333334, "learning_rate": 0.0001, "loss": 4.2969, "loss/crossentropy": 1.6429635286331177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20593100041151047, "step": 4112 }, { "epoch": 0.08228, "grad_norm": 2.40625, "grad_norm_var": 0.0370025634765625, "learning_rate": 0.0001, "loss": 4.4581, "loss/crossentropy": 2.3236618041992188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2674623131752014, "step": 4114 }, { "epoch": 0.08232, "grad_norm": 2.578125, "grad_norm_var": 0.03658447265625, "learning_rate": 0.0001, "loss": 4.9734, "loss/crossentropy": 2.1479567885398865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25003983825445175, "step": 4116 }, { "epoch": 0.08236, "grad_norm": 2.578125, "grad_norm_var": 0.03611551920572917, "learning_rate": 0.0001, "loss": 5.0477, "loss/crossentropy": 2.140569031238556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24322029948234558, "step": 4118 }, { "epoch": 0.0824, "grad_norm": 2.328125, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 4.6061, "loss/crossentropy": 2.126375436782837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27629759907722473, "step": 4120 }, { "epoch": 0.08244, "grad_norm": 2.28125, "grad_norm_var": 0.015843709309895832, "learning_rate": 0.0001, "loss": 4.9143, "loss/crossentropy": 2.3699214458465576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692546397447586, "step": 4122 }, { "epoch": 0.08248, "grad_norm": 2.296875, "grad_norm_var": 0.010904947916666666, "learning_rate": 0.0001, "loss": 4.5672, "loss/crossentropy": 2.013331353664398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24442073702812195, "step": 4124 }, { "epoch": 0.08252, "grad_norm": 2.359375, "grad_norm_var": 0.010152180989583334, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 2.1869460344314575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.250150203704834, "step": 4126 }, { "epoch": 0.08256, "grad_norm": 2.203125, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 4.4564, "loss/crossentropy": 2.2725884914398193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2810261696577072, "step": 4128 }, { "epoch": 0.0826, "grad_norm": 2.328125, "grad_norm_var": 0.013688151041666667, "learning_rate": 0.0001, "loss": 4.7311, "loss/crossentropy": 1.9190022945404053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2693525403738022, "step": 4130 }, { "epoch": 0.08264, "grad_norm": 2.65625, "grad_norm_var": 0.016112263997395834, "learning_rate": 0.0001, "loss": 4.7967, "loss/crossentropy": 2.5477795600891113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27040669322013855, "step": 4132 }, { "epoch": 0.08268, "grad_norm": 2.40625, "grad_norm_var": 0.0113677978515625, "learning_rate": 0.0001, "loss": 4.7617, "loss/crossentropy": 2.231198728084564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28101250529289246, "step": 4134 }, { "epoch": 0.08272, "grad_norm": 2.296875, "grad_norm_var": 0.011693318684895834, "learning_rate": 0.0001, "loss": 4.6334, "loss/crossentropy": 2.17776882648468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24743208289146423, "step": 4136 }, { "epoch": 0.08276, "grad_norm": 2.46875, "grad_norm_var": 0.011799112955729166, "learning_rate": 0.0001, "loss": 5.0233, "loss/crossentropy": 2.418373703956604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2954525351524353, "step": 4138 }, { "epoch": 0.0828, "grad_norm": 2.3125, "grad_norm_var": 0.010969034830729167, "learning_rate": 0.0001, "loss": 4.5564, "loss/crossentropy": 2.054605543613434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23863950371742249, "step": 4140 }, { "epoch": 0.08284, "grad_norm": 2.5, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 4.8983, "loss/crossentropy": 2.054013967514038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29851172864437103, "step": 4142 }, { "epoch": 0.08288, "grad_norm": 2.46875, "grad_norm_var": 0.008610026041666666, "learning_rate": 0.0001, "loss": 4.7425, "loss/crossentropy": 2.193961024284363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563214898109436, "step": 4144 }, { "epoch": 0.08292, "grad_norm": 2.40625, "grad_norm_var": 0.008382161458333334, "learning_rate": 0.0001, "loss": 4.7995, "loss/crossentropy": 2.460008382797241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26705074310302734, "step": 4146 }, { "epoch": 0.08296, "grad_norm": 2.3125, "grad_norm_var": 0.005060831705729167, "learning_rate": 0.0001, "loss": 4.894, "loss/crossentropy": 2.508321523666382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26217761635780334, "step": 4148 }, { "epoch": 0.083, "grad_norm": 2.328125, "grad_norm_var": 0.0065582275390625, "learning_rate": 0.0001, "loss": 4.6103, "loss/crossentropy": 1.8445284366607666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23396535962820053, "step": 4150 }, { "epoch": 0.08304, "grad_norm": 2.375, "grad_norm_var": 0.005952962239583333, "learning_rate": 0.0001, "loss": 4.8048, "loss/crossentropy": 2.433600902557373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2825329154729843, "step": 4152 }, { "epoch": 0.08308, "grad_norm": 2.421875, "grad_norm_var": 0.0053212483723958336, "learning_rate": 0.0001, "loss": 4.8632, "loss/crossentropy": 2.386221170425415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2941686511039734, "step": 4154 }, { "epoch": 0.08312, "grad_norm": 2.421875, "grad_norm_var": 0.0052154541015625, "learning_rate": 0.0001, "loss": 4.7486, "loss/crossentropy": 1.9578949809074402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24851053953170776, "step": 4156 }, { "epoch": 0.08316, "grad_norm": 2.3125, "grad_norm_var": 0.004150390625, "learning_rate": 0.0001, "loss": 4.6443, "loss/crossentropy": 2.0034408569335938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25524984300136566, "step": 4158 }, { "epoch": 0.0832, "grad_norm": 2.28125, "grad_norm_var": 0.0069488525390625, "learning_rate": 0.0001, "loss": 4.6856, "loss/crossentropy": 2.3342589139938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31023962795734406, "step": 4160 }, { "epoch": 0.08324, "grad_norm": 2.46875, "grad_norm_var": 0.042867024739583336, "learning_rate": 0.0001, "loss": 4.6762, "loss/crossentropy": 2.3941839933395386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2641746401786804, "step": 4162 }, { "epoch": 0.08328, "grad_norm": 2.453125, "grad_norm_var": 0.04168294270833333, "learning_rate": 0.0001, "loss": 4.7149, "loss/crossentropy": 2.371219038963318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24197939038276672, "step": 4164 }, { "epoch": 0.08332, "grad_norm": 2.734375, "grad_norm_var": 0.04442952473958333, "learning_rate": 0.0001, "loss": 4.7949, "loss/crossentropy": 2.133378028869629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25449906289577484, "step": 4166 }, { "epoch": 0.08336, "grad_norm": 2.4375, "grad_norm_var": 0.04496968587239583, "learning_rate": 0.0001, "loss": 4.5974, "loss/crossentropy": 1.7460412979125977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157151699066162, "step": 4168 }, { "epoch": 0.0834, "grad_norm": 2.609375, "grad_norm_var": 0.046873982747395834, "learning_rate": 0.0001, "loss": 4.7234, "loss/crossentropy": 2.215083122253418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2823774367570877, "step": 4170 }, { "epoch": 0.08344, "grad_norm": 2.453125, "grad_norm_var": 0.04820048014322917, "learning_rate": 0.0001, "loss": 4.5189, "loss/crossentropy": 2.0528377890586853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551003098487854, "step": 4172 }, { "epoch": 0.08348, "grad_norm": 2.359375, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 4.4008, "loss/crossentropy": 1.7953855395317078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23185917735099792, "step": 4174 }, { "epoch": 0.08352, "grad_norm": 2.34375, "grad_norm_var": 0.058934529622395836, "learning_rate": 0.0001, "loss": 4.4879, "loss/crossentropy": 2.0794734954833984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23386041820049286, "step": 4176 }, { "epoch": 0.08356, "grad_norm": 2.234375, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 4.3802, "loss/crossentropy": 2.1685845851898193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26074862480163574, "step": 4178 }, { "epoch": 0.0836, "grad_norm": 2.484375, "grad_norm_var": 0.031412760416666664, "learning_rate": 0.0001, "loss": 4.5507, "loss/crossentropy": 2.1495825052261353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2689145505428314, "step": 4180 }, { "epoch": 0.08364, "grad_norm": 2.328125, "grad_norm_var": 0.024967447916666666, "learning_rate": 0.0001, "loss": 4.5258, "loss/crossentropy": 2.043331503868103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2498578578233719, "step": 4182 }, { "epoch": 0.08368, "grad_norm": 2.734375, "grad_norm_var": 0.07787984212239583, "learning_rate": 0.0001, "loss": 5.0198, "loss/crossentropy": 2.04026997089386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2873340845108032, "step": 4184 }, { "epoch": 0.08372, "grad_norm": 2.515625, "grad_norm_var": 0.09128316243489583, "learning_rate": 0.0001, "loss": 4.9537, "loss/crossentropy": 2.4653968811035156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2911294251680374, "step": 4186 }, { "epoch": 0.08376, "grad_norm": 2.421875, "grad_norm_var": 0.09215494791666666, "learning_rate": 0.0001, "loss": 4.6589, "loss/crossentropy": 2.2960848808288574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23984040319919586, "step": 4188 }, { "epoch": 0.0838, "grad_norm": 2.515625, "grad_norm_var": 0.09599202473958333, "learning_rate": 0.0001, "loss": 4.4324, "loss/crossentropy": 2.011807084083557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2523125037550926, "step": 4190 }, { "epoch": 0.08384, "grad_norm": 2.421875, "grad_norm_var": 0.0886871337890625, "learning_rate": 0.0001, "loss": 4.8437, "loss/crossentropy": 2.0016889572143555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2291206791996956, "step": 4192 }, { "epoch": 0.08388, "grad_norm": 2.1875, "grad_norm_var": 0.09378255208333333, "learning_rate": 0.0001, "loss": 4.3604, "loss/crossentropy": 1.97197824716568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2595779076218605, "step": 4194 }, { "epoch": 0.08392, "grad_norm": 2.34375, "grad_norm_var": 0.09763895670572917, "learning_rate": 0.0001, "loss": 4.5823, "loss/crossentropy": 2.2910103797912598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24870596826076508, "step": 4196 }, { "epoch": 0.08396, "grad_norm": 2.234375, "grad_norm_var": 0.1001617431640625, "learning_rate": 0.0001, "loss": 4.562, "loss/crossentropy": 2.1453208923339844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25001347810029984, "step": 4198 }, { "epoch": 0.084, "grad_norm": 2.359375, "grad_norm_var": 0.0398834228515625, "learning_rate": 0.0001, "loss": 4.8835, "loss/crossentropy": 2.1935043334960938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26807525753974915, "step": 4200 }, { "epoch": 0.08404, "grad_norm": 2.3125, "grad_norm_var": 0.009765625, "learning_rate": 0.0001, "loss": 4.5912, "loss/crossentropy": 2.039341926574707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460380420088768, "step": 4202 }, { "epoch": 0.08408, "grad_norm": 3.09375, "grad_norm_var": 0.0478179931640625, "learning_rate": 0.0001, "loss": 4.8243, "loss/crossentropy": 2.4660122394561768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3042101263999939, "step": 4204 }, { "epoch": 0.08412, "grad_norm": 2.453125, "grad_norm_var": 0.08088785807291667, "learning_rate": 0.0001, "loss": 4.8635, "loss/crossentropy": 1.9346272349357605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25924334675073624, "step": 4206 }, { "epoch": 0.08416, "grad_norm": 2.1875, "grad_norm_var": 0.08311258951822917, "learning_rate": 0.0001, "loss": 4.5152, "loss/crossentropy": 2.0120063424110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24327433109283447, "step": 4208 }, { "epoch": 0.0842, "grad_norm": 2.703125, "grad_norm_var": 0.0852935791015625, "learning_rate": 0.0001, "loss": 4.6148, "loss/crossentropy": 2.2359931468963623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25374244898557663, "step": 4210 }, { "epoch": 0.08424, "grad_norm": 2.53125, "grad_norm_var": 0.08185221354166666, "learning_rate": 0.0001, "loss": 4.5751, "loss/crossentropy": 2.0038134455680847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22315364331007004, "step": 4212 }, { "epoch": 0.08428, "grad_norm": 2.296875, "grad_norm_var": 0.08567301432291667, "learning_rate": 0.0001, "loss": 4.721, "loss/crossentropy": 2.2041471004486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655494213104248, "step": 4214 }, { "epoch": 0.08432, "grad_norm": 2.390625, "grad_norm_var": 0.08399149576822916, "learning_rate": 0.0001, "loss": 4.8091, "loss/crossentropy": 2.344551682472229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2743126451969147, "step": 4216 }, { "epoch": 0.08436, "grad_norm": 2.453125, "grad_norm_var": 0.08025614420572917, "learning_rate": 0.0001, "loss": 4.7162, "loss/crossentropy": 1.9694250226020813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24425261467695236, "step": 4218 }, { "epoch": 0.0844, "grad_norm": 2.40625, "grad_norm_var": 0.05671284993489583, "learning_rate": 0.0001, "loss": 4.8526, "loss/crossentropy": 2.164921760559082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.272469699382782, "step": 4220 }, { "epoch": 0.08444, "grad_norm": 2.484375, "grad_norm_var": 0.02642822265625, "learning_rate": 0.0001, "loss": 4.5513, "loss/crossentropy": 1.944575309753418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23848393559455872, "step": 4222 }, { "epoch": 0.08448, "grad_norm": 2.359375, "grad_norm_var": 0.025386555989583334, "learning_rate": 0.0001, "loss": 4.7416, "loss/crossentropy": 2.278227686882019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2558315545320511, "step": 4224 }, { "epoch": 0.08452, "grad_norm": 2.453125, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 4.7318, "loss/crossentropy": 2.035117268562317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2585935667157173, "step": 4226 }, { "epoch": 0.08456, "grad_norm": 2.234375, "grad_norm_var": 0.020466105143229166, "learning_rate": 0.0001, "loss": 4.5674, "loss/crossentropy": 2.0172035694122314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23832131922245026, "step": 4228 }, { "epoch": 0.0846, "grad_norm": 2.484375, "grad_norm_var": 0.011165364583333334, "learning_rate": 0.0001, "loss": 4.8113, "loss/crossentropy": 2.0574535727500916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23829226195812225, "step": 4230 }, { "epoch": 0.08464, "grad_norm": 2.3125, "grad_norm_var": 0.010773722330729167, "learning_rate": 0.0001, "loss": 4.6776, "loss/crossentropy": 2.5003366470336914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27905476093292236, "step": 4232 }, { "epoch": 0.08468, "grad_norm": 2.578125, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 4.9137, "loss/crossentropy": 2.207367777824402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27426937222480774, "step": 4234 }, { "epoch": 0.08472, "grad_norm": 2.1875, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 4.716, "loss/crossentropy": 2.240189790725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26208513230085373, "step": 4236 }, { "epoch": 0.08476, "grad_norm": 2.3125, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 4.4569, "loss/crossentropy": 2.1357412338256836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24108420312404633, "step": 4238 }, { "epoch": 0.0848, "grad_norm": 2.296875, "grad_norm_var": 0.013004557291666666, "learning_rate": 0.0001, "loss": 4.7249, "loss/crossentropy": 2.1073816418647766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25213342159986496, "step": 4240 }, { "epoch": 0.08484, "grad_norm": 2.40625, "grad_norm_var": 0.013102213541666666, "learning_rate": 0.0001, "loss": 4.9558, "loss/crossentropy": 2.158124566078186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505848854780197, "step": 4242 }, { "epoch": 0.08488, "grad_norm": 2.328125, "grad_norm_var": 0.011555989583333334, "learning_rate": 0.0001, "loss": 4.7743, "loss/crossentropy": 2.253539562225342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2675466388463974, "step": 4244 }, { "epoch": 0.08492, "grad_norm": 2.5, "grad_norm_var": 0.01396484375, "learning_rate": 0.0001, "loss": 4.272, "loss/crossentropy": 1.7170023918151855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20157357305288315, "step": 4246 }, { "epoch": 0.08496, "grad_norm": 2.40625, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 4.4479, "loss/crossentropy": 2.082640767097473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2600134015083313, "step": 4248 }, { "epoch": 0.085, "grad_norm": 2.34375, "grad_norm_var": 0.0165679931640625, "learning_rate": 0.0001, "loss": 4.4176, "loss/crossentropy": 2.044301390647888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23220707476139069, "step": 4250 }, { "epoch": 0.08504, "grad_norm": 2.453125, "grad_norm_var": 0.0168121337890625, "learning_rate": 0.0001, "loss": 4.648, "loss/crossentropy": 2.293405532836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28027066588401794, "step": 4252 }, { "epoch": 0.08508, "grad_norm": 2.265625, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 4.55, "loss/crossentropy": 2.2604206800460815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25545646995306015, "step": 4254 }, { "epoch": 0.08512, "grad_norm": 2.65625, "grad_norm_var": 0.023949178059895833, "learning_rate": 0.0001, "loss": 4.6258, "loss/crossentropy": 2.118361234664917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24799348413944244, "step": 4256 }, { "epoch": 0.08516, "grad_norm": 2.390625, "grad_norm_var": 0.022847493489583332, "learning_rate": 0.0001, "loss": 4.6751, "loss/crossentropy": 1.9369969964027405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24676478654146194, "step": 4258 }, { "epoch": 0.0852, "grad_norm": 2.40625, "grad_norm_var": 0.021728515625, "learning_rate": 0.0001, "loss": 4.5197, "loss/crossentropy": 2.075170874595642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21742676943540573, "step": 4260 }, { "epoch": 0.08524, "grad_norm": 2.296875, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 4.4112, "loss/crossentropy": 2.056099236011505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2490842342376709, "step": 4262 }, { "epoch": 0.08528, "grad_norm": 2.328125, "grad_norm_var": 0.014436848958333333, "learning_rate": 0.0001, "loss": 4.6169, "loss/crossentropy": 2.2279993891716003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24964337795972824, "step": 4264 }, { "epoch": 0.08532, "grad_norm": 2.375, "grad_norm_var": 0.011393229166666666, "learning_rate": 0.0001, "loss": 4.6686, "loss/crossentropy": 2.1645933389663696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24576786905527115, "step": 4266 }, { "epoch": 0.08536, "grad_norm": 2.25, "grad_norm_var": 0.010887654622395833, "learning_rate": 0.0001, "loss": 4.4458, "loss/crossentropy": 1.9033920764923096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23393237590789795, "step": 4268 }, { "epoch": 0.0854, "grad_norm": 2.46875, "grad_norm_var": 0.011865234375, "learning_rate": 0.0001, "loss": 4.4022, "loss/crossentropy": 2.153634190559387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2496839165687561, "step": 4270 }, { "epoch": 0.08544, "grad_norm": 2.28125, "grad_norm_var": 0.0045206705729166664, "learning_rate": 0.0001, "loss": 4.4781, "loss/crossentropy": 1.9188589453697205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23544569313526154, "step": 4272 }, { "epoch": 0.08548, "grad_norm": 2.328125, "grad_norm_var": 0.004264322916666666, "learning_rate": 0.0001, "loss": 4.704, "loss/crossentropy": 2.4337977170944214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2951700836420059, "step": 4274 }, { "epoch": 0.08552, "grad_norm": 2.359375, "grad_norm_var": 0.003902180989583333, "learning_rate": 0.0001, "loss": 4.7051, "loss/crossentropy": 1.9108383059501648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24459081888198853, "step": 4276 }, { "epoch": 0.08556, "grad_norm": 2.375, "grad_norm_var": 0.003123982747395833, "learning_rate": 0.0001, "loss": 4.3751, "loss/crossentropy": 1.6632736921310425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22495487332344055, "step": 4278 }, { "epoch": 0.0856, "grad_norm": 2.296875, "grad_norm_var": 0.021312459309895834, "learning_rate": 0.0001, "loss": 4.8144, "loss/crossentropy": 2.519997477531433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2657178193330765, "step": 4280 }, { "epoch": 0.08564, "grad_norm": 2.640625, "grad_norm_var": 0.025275675455729167, "learning_rate": 0.0001, "loss": 4.6837, "loss/crossentropy": 2.150822162628174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30192580074071884, "step": 4282 }, { "epoch": 0.08568, "grad_norm": 2.421875, "grad_norm_var": 0.023763020833333332, "learning_rate": 0.0001, "loss": 4.7411, "loss/crossentropy": 1.9970062971115112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24695321917533875, "step": 4284 }, { "epoch": 0.08572, "grad_norm": 2.25, "grad_norm_var": 0.024372355143229166, "learning_rate": 0.0001, "loss": 4.5361, "loss/crossentropy": 2.3136098384857178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25082121044397354, "step": 4286 }, { "epoch": 0.08576, "grad_norm": 2.5625, "grad_norm_var": 0.025992838541666667, "learning_rate": 0.0001, "loss": 4.9171, "loss/crossentropy": 2.112035870552063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27019062638282776, "step": 4288 }, { "epoch": 0.0858, "grad_norm": 2.328125, "grad_norm_var": 0.025992838541666667, "learning_rate": 0.0001, "loss": 4.4985, "loss/crossentropy": 2.068653643131256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24297921359539032, "step": 4290 }, { "epoch": 0.08584, "grad_norm": 2.375, "grad_norm_var": 0.025520833333333333, "learning_rate": 0.0001, "loss": 4.5182, "loss/crossentropy": 1.9013578295707703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23058265447616577, "step": 4292 }, { "epoch": 0.08588, "grad_norm": 2.15625, "grad_norm_var": 0.0294830322265625, "learning_rate": 0.0001, "loss": 4.6825, "loss/crossentropy": 2.149984359741211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25223904848098755, "step": 4294 }, { "epoch": 0.08592, "grad_norm": 2.328125, "grad_norm_var": 0.014469401041666666, "learning_rate": 0.0001, "loss": 4.4109, "loss/crossentropy": 1.894010066986084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2323223054409027, "step": 4296 }, { "epoch": 0.08596, "grad_norm": 2.421875, "grad_norm_var": 0.010123697916666667, "learning_rate": 0.0001, "loss": 4.7653, "loss/crossentropy": 2.3351621627807617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27194930613040924, "step": 4298 }, { "epoch": 0.086, "grad_norm": 2.328125, "grad_norm_var": 0.0097320556640625, "learning_rate": 0.0001, "loss": 4.741, "loss/crossentropy": 2.224352180957794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24884501099586487, "step": 4300 }, { "epoch": 0.08604, "grad_norm": 2.421875, "grad_norm_var": 0.008854166666666666, "learning_rate": 0.0001, "loss": 4.6592, "loss/crossentropy": 1.908318042755127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24747492372989655, "step": 4302 }, { "epoch": 0.08608, "grad_norm": 2.296875, "grad_norm_var": 0.00758056640625, "learning_rate": 0.0001, "loss": 4.8566, "loss/crossentropy": 2.1990396976470947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27217794954776764, "step": 4304 }, { "epoch": 0.08612, "grad_norm": 2.390625, "grad_norm_var": 0.0098297119140625, "learning_rate": 0.0001, "loss": 4.6432, "loss/crossentropy": 2.3146010637283325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26875850558280945, "step": 4306 }, { "epoch": 0.08616, "grad_norm": 2.46875, "grad_norm_var": 0.015208943684895834, "learning_rate": 0.0001, "loss": 4.8254, "loss/crossentropy": 2.2507941722869873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27052539587020874, "step": 4308 }, { "epoch": 0.0862, "grad_norm": 2.234375, "grad_norm_var": 0.013199869791666667, "learning_rate": 0.0001, "loss": 4.4067, "loss/crossentropy": 1.9077125787734985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22987178713083267, "step": 4310 }, { "epoch": 0.08624, "grad_norm": 2.515625, "grad_norm_var": 0.01353759765625, "learning_rate": 0.0001, "loss": 4.4822, "loss/crossentropy": 1.951395332813263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2392890453338623, "step": 4312 }, { "epoch": 0.08628, "grad_norm": 2.53125, "grad_norm_var": 0.033854166666666664, "learning_rate": 0.0001, "loss": 4.5371, "loss/crossentropy": 1.9426860213279724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24374966323375702, "step": 4314 }, { "epoch": 0.08632, "grad_norm": 3.390625, "grad_norm_var": 0.09062398274739583, "learning_rate": 0.0001, "loss": 5.253, "loss/crossentropy": 2.2508288621902466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3862452507019043, "step": 4316 }, { "epoch": 0.08636, "grad_norm": 2.359375, "grad_norm_var": 0.09058329264322916, "learning_rate": 0.0001, "loss": 4.5288, "loss/crossentropy": 2.1161463260650635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557579278945923, "step": 4318 }, { "epoch": 0.0864, "grad_norm": 2.3125, "grad_norm_var": 0.09374593098958334, "learning_rate": 0.0001, "loss": 5.1146, "loss/crossentropy": 2.2570544481277466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31447017192840576, "step": 4320 }, { "epoch": 0.08644, "grad_norm": 2.328125, "grad_norm_var": 0.08740234375, "learning_rate": 0.0001, "loss": 4.9724, "loss/crossentropy": 2.3211100101470947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26107798516750336, "step": 4322 }, { "epoch": 0.08648, "grad_norm": 2.25, "grad_norm_var": 0.09231669108072917, "learning_rate": 0.0001, "loss": 4.5236, "loss/crossentropy": 2.1451058387756348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23633568733930588, "step": 4324 }, { "epoch": 0.08652, "grad_norm": 2.21875, "grad_norm_var": 0.09463602701822917, "learning_rate": 0.0001, "loss": 4.5828, "loss/crossentropy": 1.9880141615867615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23688867688179016, "step": 4326 }, { "epoch": 0.08656, "grad_norm": 2.296875, "grad_norm_var": 0.09724934895833333, "learning_rate": 0.0001, "loss": 4.7098, "loss/crossentropy": 2.021056890487671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2543798238039017, "step": 4328 }, { "epoch": 0.0866, "grad_norm": 2.359375, "grad_norm_var": 0.0814453125, "learning_rate": 0.0001, "loss": 4.6439, "loss/crossentropy": 2.1323755979537964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27410852909088135, "step": 4330 }, { "epoch": 0.08664, "grad_norm": 2.296875, "grad_norm_var": 0.014940388997395833, "learning_rate": 0.0001, "loss": 4.6945, "loss/crossentropy": 1.9674875736236572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24818265438079834, "step": 4332 }, { "epoch": 0.08668, "grad_norm": 2.25, "grad_norm_var": 0.01490478515625, "learning_rate": 0.0001, "loss": 4.1911, "loss/crossentropy": 2.0466583967208862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567761391401291, "step": 4334 }, { "epoch": 0.08672, "grad_norm": 2.375, "grad_norm_var": 0.004423014322916667, "learning_rate": 0.0001, "loss": 4.5937, "loss/crossentropy": 2.138857126235962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26430967450141907, "step": 4336 }, { "epoch": 0.08676, "grad_norm": 2.390625, "grad_norm_var": 0.003902180989583333, "learning_rate": 0.0001, "loss": 4.7168, "loss/crossentropy": 2.164841413497925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24389629065990448, "step": 4338 }, { "epoch": 0.0868, "grad_norm": 2.4375, "grad_norm_var": 0.005322265625, "learning_rate": 0.0001, "loss": 4.6017, "loss/crossentropy": 2.2220189571380615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24361558258533478, "step": 4340 }, { "epoch": 0.08684, "grad_norm": 2.390625, "grad_norm_var": 0.004792277018229167, "learning_rate": 0.0001, "loss": 4.3088, "loss/crossentropy": 1.7106285095214844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21558403968811035, "step": 4342 }, { "epoch": 0.08688, "grad_norm": 2.203125, "grad_norm_var": 0.0053670247395833336, "learning_rate": 0.0001, "loss": 4.1647, "loss/crossentropy": 1.9200173020362854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502904310822487, "step": 4344 }, { "epoch": 0.08692, "grad_norm": 2.828125, "grad_norm_var": 0.020796712239583334, "learning_rate": 0.0001, "loss": 4.6817, "loss/crossentropy": 1.883722960948944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24818243086338043, "step": 4346 }, { "epoch": 0.08696, "grad_norm": 2.484375, "grad_norm_var": 0.026146443684895833, "learning_rate": 0.0001, "loss": 4.7509, "loss/crossentropy": 2.2069878578186035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2518744319677353, "step": 4348 }, { "epoch": 0.087, "grad_norm": 2.484375, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 4.7235, "loss/crossentropy": 2.2158325910568237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551049590110779, "step": 4350 }, { "epoch": 0.08704, "grad_norm": 2.359375, "grad_norm_var": 0.02730712890625, "learning_rate": 0.0001, "loss": 4.8406, "loss/crossentropy": 2.0580105781555176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2742728739976883, "step": 4352 }, { "epoch": 0.08708, "grad_norm": 2.34375, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 4.5267, "loss/crossentropy": 2.190276265144348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.274506613612175, "step": 4354 }, { "epoch": 0.08712, "grad_norm": 2.3125, "grad_norm_var": 0.027242024739583332, "learning_rate": 0.0001, "loss": 4.7499, "loss/crossentropy": 2.2595328092575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25295622646808624, "step": 4356 }, { "epoch": 0.08716, "grad_norm": 2.4375, "grad_norm_var": 0.027391560872395835, "learning_rate": 0.0001, "loss": 5.1247, "loss/crossentropy": 2.322342872619629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2490948587656021, "step": 4358 }, { "epoch": 0.0872, "grad_norm": 2.328125, "grad_norm_var": 0.024665323893229167, "learning_rate": 0.0001, "loss": 4.7045, "loss/crossentropy": 2.108223795890808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25404803454875946, "step": 4360 }, { "epoch": 0.08724, "grad_norm": 2.3125, "grad_norm_var": 0.013825480143229167, "learning_rate": 0.0001, "loss": 4.5332, "loss/crossentropy": 2.1363136768341064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2679157853126526, "step": 4362 }, { "epoch": 0.08728, "grad_norm": 2.390625, "grad_norm_var": 0.0076171875, "learning_rate": 0.0001, "loss": 4.6418, "loss/crossentropy": 2.3207738399505615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2725762128829956, "step": 4364 }, { "epoch": 0.08732, "grad_norm": 2.40625, "grad_norm_var": 0.004813639322916666, "learning_rate": 0.0001, "loss": 4.7431, "loss/crossentropy": 2.3179128170013428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27275949716567993, "step": 4366 }, { "epoch": 0.08736, "grad_norm": 2.328125, "grad_norm_var": 0.005182902018229167, "learning_rate": 0.0001, "loss": 4.7355, "loss/crossentropy": 2.2130206823349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2787477523088455, "step": 4368 }, { "epoch": 0.0874, "grad_norm": 2.40625, "grad_norm_var": 0.004548136393229167, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.3350926637649536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26483266800642014, "step": 4370 }, { "epoch": 0.08744, "grad_norm": 2.4375, "grad_norm_var": 0.005736287434895833, "learning_rate": 0.0001, "loss": 4.7329, "loss/crossentropy": 1.9162638187408447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23748356848955154, "step": 4372 }, { "epoch": 0.08748, "grad_norm": 2.484375, "grad_norm_var": 0.006322224934895833, "learning_rate": 0.0001, "loss": 4.7045, "loss/crossentropy": 2.2708429098129272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2709425985813141, "step": 4374 }, { "epoch": 0.08752, "grad_norm": 2.234375, "grad_norm_var": 0.011604817708333333, "learning_rate": 0.0001, "loss": 4.3481, "loss/crossentropy": 1.7216318845748901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22498781234025955, "step": 4376 }, { "epoch": 0.08756, "grad_norm": 2.328125, "grad_norm_var": 0.011937459309895834, "learning_rate": 0.0001, "loss": 4.4261, "loss/crossentropy": 2.144331693649292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25400668382644653, "step": 4378 }, { "epoch": 0.0876, "grad_norm": 2.328125, "grad_norm_var": 0.011750284830729167, "learning_rate": 0.0001, "loss": 4.5617, "loss/crossentropy": 2.305369734764099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29137127101421356, "step": 4380 }, { "epoch": 0.08764, "grad_norm": 2.65625, "grad_norm_var": 0.018115234375, "learning_rate": 0.0001, "loss": 4.6861, "loss/crossentropy": 2.1156765818595886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24203064292669296, "step": 4382 }, { "epoch": 0.08768, "grad_norm": 2.28125, "grad_norm_var": 0.016630045572916665, "learning_rate": 0.0001, "loss": 4.4544, "loss/crossentropy": 1.9081769585609436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22396781295537949, "step": 4384 }, { "epoch": 0.08772, "grad_norm": 2.265625, "grad_norm_var": 0.017769368489583333, "learning_rate": 0.0001, "loss": 4.704, "loss/crossentropy": 2.0938609838485718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27330371737480164, "step": 4386 }, { "epoch": 0.08776, "grad_norm": 2.34375, "grad_norm_var": 0.017748006184895835, "learning_rate": 0.0001, "loss": 4.1179, "loss/crossentropy": 1.9685207605361938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2287725731730461, "step": 4388 }, { "epoch": 0.0878, "grad_norm": 2.3125, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 4.5233, "loss/crossentropy": 1.7536925673484802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2345883920788765, "step": 4390 }, { "epoch": 0.08784, "grad_norm": 2.359375, "grad_norm_var": 0.029150390625, "learning_rate": 0.0001, "loss": 4.3897, "loss/crossentropy": 2.145567834377289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26126645505428314, "step": 4392 }, { "epoch": 0.08788, "grad_norm": 2.390625, "grad_norm_var": 0.028348795572916665, "learning_rate": 0.0001, "loss": 4.4677, "loss/crossentropy": 2.2211687564849854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3332909345626831, "step": 4394 }, { "epoch": 0.08792, "grad_norm": 2.375, "grad_norm_var": 0.028539021809895832, "learning_rate": 0.0001, "loss": 4.5545, "loss/crossentropy": 1.919084072113037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2480143904685974, "step": 4396 }, { "epoch": 0.08796, "grad_norm": 2.421875, "grad_norm_var": 0.025121053059895832, "learning_rate": 0.0001, "loss": 4.6408, "loss/crossentropy": 2.1769548654556274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.269208699464798, "step": 4398 }, { "epoch": 0.088, "grad_norm": 2.5, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 4.7372, "loss/crossentropy": 1.8480086922645569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2432136833667755, "step": 4400 }, { "epoch": 0.08804, "grad_norm": 2.46875, "grad_norm_var": 0.026838175455729165, "learning_rate": 0.0001, "loss": 4.7303, "loss/crossentropy": 2.1948903799057007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28328536450862885, "step": 4402 }, { "epoch": 0.08808, "grad_norm": 2.40625, "grad_norm_var": 0.022554524739583335, "learning_rate": 0.0001, "loss": 4.6929, "loss/crossentropy": 2.163570761680603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.282375693321228, "step": 4404 }, { "epoch": 0.08812, "grad_norm": 2.34375, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 4.6882, "loss/crossentropy": 2.3737696409225464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26703887432813644, "step": 4406 }, { "epoch": 0.08816, "grad_norm": 2.71875, "grad_norm_var": 0.015192667643229166, "learning_rate": 0.0001, "loss": 4.6959, "loss/crossentropy": 2.2449779510498047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2744368612766266, "step": 4408 }, { "epoch": 0.0882, "grad_norm": 2.515625, "grad_norm_var": 0.015348307291666667, "learning_rate": 0.0001, "loss": 4.6602, "loss/crossentropy": 2.1167399287223816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28688907623291016, "step": 4410 }, { "epoch": 0.08824, "grad_norm": 2.3125, "grad_norm_var": 0.017118326822916665, "learning_rate": 0.0001, "loss": 4.3777, "loss/crossentropy": 2.2249929904937744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24534232914447784, "step": 4412 }, { "epoch": 0.08828, "grad_norm": 2.421875, "grad_norm_var": 0.019261678059895832, "learning_rate": 0.0001, "loss": 4.7013, "loss/crossentropy": 2.172566771507263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2783561646938324, "step": 4414 }, { "epoch": 0.08832, "grad_norm": 2.234375, "grad_norm_var": 0.018684895833333333, "learning_rate": 0.0001, "loss": 4.3536, "loss/crossentropy": 2.0709031224250793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419627606868744, "step": 4416 }, { "epoch": 0.08836, "grad_norm": 3.359375, "grad_norm_var": 0.0735260009765625, "learning_rate": 0.0001, "loss": 4.8378, "loss/crossentropy": 2.2390655279159546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27328142523765564, "step": 4418 }, { "epoch": 0.0884, "grad_norm": 2.59375, "grad_norm_var": 0.07681884765625, "learning_rate": 0.0001, "loss": 4.6879, "loss/crossentropy": 2.061118960380554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23206621408462524, "step": 4420 }, { "epoch": 0.08844, "grad_norm": 2.140625, "grad_norm_var": 0.09147847493489583, "learning_rate": 0.0001, "loss": 4.3028, "loss/crossentropy": 1.4919558763504028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17881463468074799, "step": 4422 }, { "epoch": 0.08848, "grad_norm": 2.53125, "grad_norm_var": 0.08662109375, "learning_rate": 0.0001, "loss": 4.7705, "loss/crossentropy": 2.267301082611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27718164026737213, "step": 4424 }, { "epoch": 0.08852, "grad_norm": 2.375, "grad_norm_var": 0.0862457275390625, "learning_rate": 0.0001, "loss": 4.635, "loss/crossentropy": 1.9008094668388367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22865734994411469, "step": 4426 }, { "epoch": 0.08856, "grad_norm": 2.421875, "grad_norm_var": 0.090966796875, "learning_rate": 0.0001, "loss": 4.385, "loss/crossentropy": 1.8788060545921326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23521529138088226, "step": 4428 }, { "epoch": 0.0886, "grad_norm": 2.546875, "grad_norm_var": 0.08844401041666666, "learning_rate": 0.0001, "loss": 4.6703, "loss/crossentropy": 2.0775802731513977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3402235209941864, "step": 4430 }, { "epoch": 0.08864, "grad_norm": 2.515625, "grad_norm_var": 0.08642171223958334, "learning_rate": 0.0001, "loss": 4.5722, "loss/crossentropy": 2.0950201749801636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2665044367313385, "step": 4432 }, { "epoch": 0.08868, "grad_norm": 2.40625, "grad_norm_var": 0.026167805989583334, "learning_rate": 0.0001, "loss": 4.9235, "loss/crossentropy": 2.328200340270996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24911007285118103, "step": 4434 }, { "epoch": 0.08872, "grad_norm": 2.515625, "grad_norm_var": 0.023273722330729166, "learning_rate": 0.0001, "loss": 4.7462, "loss/crossentropy": 2.1840142011642456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2855416387319565, "step": 4436 }, { "epoch": 0.08876, "grad_norm": 2.25, "grad_norm_var": 0.015559895833333334, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.6167555451393127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21646380424499512, "step": 4438 }, { "epoch": 0.0888, "grad_norm": 2.25, "grad_norm_var": 0.015543619791666666, "learning_rate": 0.0001, "loss": 4.5274, "loss/crossentropy": 1.9902858138084412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.237789124250412, "step": 4440 }, { "epoch": 0.08884, "grad_norm": 2.359375, "grad_norm_var": 0.016357421875, "learning_rate": 0.0001, "loss": 4.6682, "loss/crossentropy": 2.4779287576675415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2726414203643799, "step": 4442 }, { "epoch": 0.08888, "grad_norm": 2.46875, "grad_norm_var": 0.0116119384765625, "learning_rate": 0.0001, "loss": 4.7415, "loss/crossentropy": 2.0914896726608276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22887279838323593, "step": 4444 }, { "epoch": 0.08892, "grad_norm": 2.265625, "grad_norm_var": 0.010570271809895834, "learning_rate": 0.0001, "loss": 4.498, "loss/crossentropy": 2.0526055693626404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2509382963180542, "step": 4446 }, { "epoch": 0.08896, "grad_norm": 2.15625, "grad_norm_var": 0.010347493489583333, "learning_rate": 0.0001, "loss": 4.4306, "loss/crossentropy": 1.9779084920883179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2320951297879219, "step": 4448 }, { "epoch": 0.089, "grad_norm": 2.421875, "grad_norm_var": 0.010399373372395833, "learning_rate": 0.0001, "loss": 4.7725, "loss/crossentropy": 2.2081698179244995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29023079574108124, "step": 4450 }, { "epoch": 0.08904, "grad_norm": 2.296875, "grad_norm_var": 0.0074127197265625, "learning_rate": 0.0001, "loss": 4.542, "loss/crossentropy": 1.834806501865387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23430980741977692, "step": 4452 }, { "epoch": 0.08908, "grad_norm": 2.28125, "grad_norm_var": 0.006884765625, "learning_rate": 0.0001, "loss": 4.7087, "loss/crossentropy": 2.4750468730926514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27945323288440704, "step": 4454 }, { "epoch": 0.08912, "grad_norm": 2.28125, "grad_norm_var": 0.007255045572916666, "learning_rate": 0.0001, "loss": 4.6822, "loss/crossentropy": 2.1766942739486694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28881968557834625, "step": 4456 }, { "epoch": 0.08916, "grad_norm": 2.28125, "grad_norm_var": 0.009065755208333333, "learning_rate": 0.0001, "loss": 4.9204, "loss/crossentropy": 2.265195846557617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2616717368364334, "step": 4458 }, { "epoch": 0.0892, "grad_norm": 2.171875, "grad_norm_var": 0.011473592122395833, "learning_rate": 0.0001, "loss": 4.5747, "loss/crossentropy": 2.2438716888427734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2630545049905777, "step": 4460 }, { "epoch": 0.08924, "grad_norm": 2.28125, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0001, "loss": 4.3404, "loss/crossentropy": 2.060324013233185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639824986457825, "step": 4462 }, { "epoch": 0.08928, "grad_norm": 2.453125, "grad_norm_var": 0.0331207275390625, "learning_rate": 0.0001, "loss": 4.733, "loss/crossentropy": 1.8830525279045105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21016474813222885, "step": 4464 }, { "epoch": 0.08932, "grad_norm": 2.484375, "grad_norm_var": 0.03435872395833333, "learning_rate": 0.0001, "loss": 4.7571, "loss/crossentropy": 2.3420846462249756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28108468651771545, "step": 4466 }, { "epoch": 0.08936, "grad_norm": 2.515625, "grad_norm_var": 0.0337310791015625, "learning_rate": 0.0001, "loss": 4.6851, "loss/crossentropy": 1.9140342473983765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22587041556835175, "step": 4468 }, { "epoch": 0.0894, "grad_norm": 2.390625, "grad_norm_var": 0.032613118489583336, "learning_rate": 0.0001, "loss": 4.79, "loss/crossentropy": 1.9753122925758362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24506878852844238, "step": 4470 }, { "epoch": 0.08944, "grad_norm": 2.3125, "grad_norm_var": 0.032389322916666664, "learning_rate": 0.0001, "loss": 4.7769, "loss/crossentropy": 2.0735195875167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2763114273548126, "step": 4472 }, { "epoch": 0.08948, "grad_norm": 2.28125, "grad_norm_var": 0.034403483072916664, "learning_rate": 0.0001, "loss": 4.3593, "loss/crossentropy": 1.7784460186958313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22829323261976242, "step": 4474 }, { "epoch": 0.08952, "grad_norm": 2.625, "grad_norm_var": 0.051041666666666666, "learning_rate": 0.0001, "loss": 4.9451, "loss/crossentropy": 2.1188095808029175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27410270273685455, "step": 4476 }, { "epoch": 0.08956, "grad_norm": 2.421875, "grad_norm_var": 0.051634724934895834, "learning_rate": 0.0001, "loss": 4.3392, "loss/crossentropy": 2.320235252380371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27514997124671936, "step": 4478 }, { "epoch": 0.0896, "grad_norm": 2.265625, "grad_norm_var": 0.03766988118489583, "learning_rate": 0.0001, "loss": 4.8345, "loss/crossentropy": 2.3023080825805664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2516366094350815, "step": 4480 }, { "epoch": 0.08964, "grad_norm": 2.390625, "grad_norm_var": 0.039567057291666666, "learning_rate": 0.0001, "loss": 4.6698, "loss/crossentropy": 1.9677563905715942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2362382560968399, "step": 4482 }, { "epoch": 0.08968, "grad_norm": 2.34375, "grad_norm_var": 0.03951416015625, "learning_rate": 0.0001, "loss": 4.9159, "loss/crossentropy": 2.2005198001861572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2760336175560951, "step": 4484 }, { "epoch": 0.08972, "grad_norm": 2.0625, "grad_norm_var": 0.0478912353515625, "learning_rate": 0.0001, "loss": 4.4418, "loss/crossentropy": 1.9799931049346924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25132423639297485, "step": 4486 }, { "epoch": 0.08976, "grad_norm": 2.46875, "grad_norm_var": 0.10212300618489584, "learning_rate": 0.0001, "loss": 4.9643, "loss/crossentropy": 2.205371141433716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2695635259151459, "step": 4488 }, { "epoch": 0.0898, "grad_norm": 2.421875, "grad_norm_var": 0.09692281087239583, "learning_rate": 0.0001, "loss": 4.9695, "loss/crossentropy": 2.3500062227249146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2736963629722595, "step": 4490 }, { "epoch": 0.08984, "grad_norm": 2.59375, "grad_norm_var": 0.07867431640625, "learning_rate": 0.0001, "loss": 4.9083, "loss/crossentropy": 2.386352837085724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2472890019416809, "step": 4492 }, { "epoch": 0.08988, "grad_norm": 2.265625, "grad_norm_var": 0.07701416015625, "learning_rate": 0.0001, "loss": 4.3598, "loss/crossentropy": 1.9863982200622559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288871705532074, "step": 4494 }, { "epoch": 0.08992, "grad_norm": 2.34375, "grad_norm_var": 0.07757059733072917, "learning_rate": 0.0001, "loss": 4.6157, "loss/crossentropy": 2.4088594913482666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24304527044296265, "step": 4496 }, { "epoch": 0.08996, "grad_norm": 2.5, "grad_norm_var": 0.07517801920572917, "learning_rate": 0.0001, "loss": 4.7457, "loss/crossentropy": 2.1663140058517456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2681227922439575, "step": 4498 }, { "epoch": 0.09, "grad_norm": 2.171875, "grad_norm_var": 0.0810455322265625, "learning_rate": 0.0001, "loss": 4.2199, "loss/crossentropy": 1.9233570098876953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23169977217912674, "step": 4500 }, { "epoch": 0.09004, "grad_norm": 2.40625, "grad_norm_var": 0.07579752604166666, "learning_rate": 0.0001, "loss": 4.4677, "loss/crossentropy": 2.2940425872802734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24809539318084717, "step": 4502 }, { "epoch": 0.09008, "grad_norm": 2.21875, "grad_norm_var": 0.015771484375, "learning_rate": 0.0001, "loss": 4.5333, "loss/crossentropy": 2.1874176263809204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26286616921424866, "step": 4504 }, { "epoch": 0.09012, "grad_norm": 2.28125, "grad_norm_var": 0.015868123372395834, "learning_rate": 0.0001, "loss": 4.5029, "loss/crossentropy": 2.190543472766876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26766398549079895, "step": 4506 }, { "epoch": 0.09016, "grad_norm": 2.296875, "grad_norm_var": 0.010823567708333334, "learning_rate": 0.0001, "loss": 4.464, "loss/crossentropy": 2.132491707801819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2514026165008545, "step": 4508 }, { "epoch": 0.0902, "grad_norm": 2.296875, "grad_norm_var": 0.0107818603515625, "learning_rate": 0.0001, "loss": 4.5032, "loss/crossentropy": 2.1492353677749634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25663119554519653, "step": 4510 }, { "epoch": 0.09024, "grad_norm": 2.203125, "grad_norm_var": 0.011454264322916666, "learning_rate": 0.0001, "loss": 4.6166, "loss/crossentropy": 2.2471927404403687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24812395125627518, "step": 4512 }, { "epoch": 0.09028, "grad_norm": 2.34375, "grad_norm_var": 0.007013956705729167, "learning_rate": 0.0001, "loss": 4.5651, "loss/crossentropy": 2.1944304704666138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26567137241363525, "step": 4514 }, { "epoch": 0.09032, "grad_norm": 2.203125, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 4.3054, "loss/crossentropy": 1.7537739872932434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21784386038780212, "step": 4516 }, { "epoch": 0.09036, "grad_norm": 2.46875, "grad_norm_var": 0.006981404622395834, "learning_rate": 0.0001, "loss": 4.6961, "loss/crossentropy": 2.133580207824707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27865441143512726, "step": 4518 }, { "epoch": 0.0904, "grad_norm": 2.5, "grad_norm_var": 0.008723958333333334, "learning_rate": 0.0001, "loss": 4.5486, "loss/crossentropy": 2.0858315229415894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26390860974788666, "step": 4520 }, { "epoch": 0.09044, "grad_norm": 2.3125, "grad_norm_var": 0.008101399739583333, "learning_rate": 0.0001, "loss": 4.6833, "loss/crossentropy": 1.9084516763687134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23013630509376526, "step": 4522 }, { "epoch": 0.09048, "grad_norm": 2.234375, "grad_norm_var": 0.00859375, "learning_rate": 0.0001, "loss": 4.5817, "loss/crossentropy": 2.2123712301254272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2739466577768326, "step": 4524 }, { "epoch": 0.09052, "grad_norm": 2.453125, "grad_norm_var": 0.009691365559895833, "learning_rate": 0.0001, "loss": 4.9898, "loss/crossentropy": 2.3532934188842773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25509175658226013, "step": 4526 }, { "epoch": 0.09056, "grad_norm": 2.328125, "grad_norm_var": 0.008137003580729166, "learning_rate": 0.0001, "loss": 4.7786, "loss/crossentropy": 2.1543048620224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639082163572311, "step": 4528 }, { "epoch": 0.0906, "grad_norm": 2.4375, "grad_norm_var": 0.0109527587890625, "learning_rate": 0.0001, "loss": 4.647, "loss/crossentropy": 2.1322286128997803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23874164372682571, "step": 4530 }, { "epoch": 0.09064, "grad_norm": 2.25, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.7623, "loss/crossentropy": 2.5552597045898438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24949757009744644, "step": 4532 }, { "epoch": 0.09068, "grad_norm": 2.421875, "grad_norm_var": 0.0117095947265625, "learning_rate": 0.0001, "loss": 4.6999, "loss/crossentropy": 2.1248819231987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2795845717191696, "step": 4534 }, { "epoch": 0.09072, "grad_norm": 2.390625, "grad_norm_var": 0.014655558268229167, "learning_rate": 0.0001, "loss": 4.7739, "loss/crossentropy": 1.985984206199646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22764238715171814, "step": 4536 }, { "epoch": 0.09076, "grad_norm": 2.328125, "grad_norm_var": 0.01519775390625, "learning_rate": 0.0001, "loss": 4.64, "loss/crossentropy": 2.220720648765564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2693287283182144, "step": 4538 }, { "epoch": 0.0908, "grad_norm": 2.3125, "grad_norm_var": 0.01324462890625, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 1.9541595578193665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419680804014206, "step": 4540 }, { "epoch": 0.09084, "grad_norm": 2.53125, "grad_norm_var": 0.013916015625, "learning_rate": 0.0001, "loss": 4.963, "loss/crossentropy": 2.275113582611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2562567666172981, "step": 4542 }, { "epoch": 0.09088, "grad_norm": 2.3125, "grad_norm_var": 0.017122395833333335, "learning_rate": 0.0001, "loss": 4.7535, "loss/crossentropy": 2.4411803483963013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25798996537923813, "step": 4544 }, { "epoch": 0.09092, "grad_norm": 2.34375, "grad_norm_var": 0.019823201497395835, "learning_rate": 0.0001, "loss": 4.7339, "loss/crossentropy": 2.035769820213318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25313758105039597, "step": 4546 }, { "epoch": 0.09096, "grad_norm": 2.171875, "grad_norm_var": 0.018701171875, "learning_rate": 0.0001, "loss": 4.1331, "loss/crossentropy": 1.9237529039382935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24456002563238144, "step": 4548 }, { "epoch": 0.091, "grad_norm": 2.234375, "grad_norm_var": 0.02281494140625, "learning_rate": 0.0001, "loss": 4.6762, "loss/crossentropy": 2.179704189300537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2681535929441452, "step": 4550 }, { "epoch": 0.09104, "grad_norm": 2.46875, "grad_norm_var": 0.024137369791666665, "learning_rate": 0.0001, "loss": 4.6899, "loss/crossentropy": 2.013023316860199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22643990069627762, "step": 4552 }, { "epoch": 0.09108, "grad_norm": 2.453125, "grad_norm_var": 0.024201456705729166, "learning_rate": 0.0001, "loss": 4.6527, "loss/crossentropy": 2.173883557319641, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26495447754859924, "step": 4554 }, { "epoch": 0.09112, "grad_norm": 2.296875, "grad_norm_var": 0.025121053059895832, "learning_rate": 0.0001, "loss": 4.5323, "loss/crossentropy": 1.9398870468139648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24307211488485336, "step": 4556 }, { "epoch": 0.09116, "grad_norm": 2.28125, "grad_norm_var": 0.025423177083333335, "learning_rate": 0.0001, "loss": 4.2028, "loss/crossentropy": 1.8551223874092102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21659143269062042, "step": 4558 }, { "epoch": 0.0912, "grad_norm": 2.375, "grad_norm_var": 0.0254058837890625, "learning_rate": 0.0001, "loss": 4.7813, "loss/crossentropy": 2.104207456111908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26515253633260727, "step": 4560 }, { "epoch": 0.09124, "grad_norm": 2.484375, "grad_norm_var": 0.023875935872395834, "learning_rate": 0.0001, "loss": 4.6241, "loss/crossentropy": 2.31631863117218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25454505532979965, "step": 4562 }, { "epoch": 0.09128, "grad_norm": 2.265625, "grad_norm_var": 0.021800740559895834, "learning_rate": 0.0001, "loss": 4.5732, "loss/crossentropy": 2.331356406211853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2667195200920105, "step": 4564 }, { "epoch": 0.09132, "grad_norm": 2.515625, "grad_norm_var": 0.0176910400390625, "learning_rate": 0.0001, "loss": 4.6714, "loss/crossentropy": 2.2126184701919556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24132181704044342, "step": 4566 }, { "epoch": 0.09136, "grad_norm": 2.25, "grad_norm_var": 0.013167317708333333, "learning_rate": 0.0001, "loss": 4.5753, "loss/crossentropy": 2.330659508705139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2838585078716278, "step": 4568 }, { "epoch": 0.0914, "grad_norm": 2.28125, "grad_norm_var": 0.0132720947265625, "learning_rate": 0.0001, "loss": 4.4935, "loss/crossentropy": 2.167214035987854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24021611362695694, "step": 4570 }, { "epoch": 0.09144, "grad_norm": 2.15625, "grad_norm_var": 0.014046223958333333, "learning_rate": 0.0001, "loss": 4.5847, "loss/crossentropy": 1.770102322101593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108836993575096, "step": 4572 }, { "epoch": 0.09148, "grad_norm": 2.234375, "grad_norm_var": 0.015425618489583333, "learning_rate": 0.0001, "loss": 4.2906, "loss/crossentropy": 1.9293717741966248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2186967208981514, "step": 4574 }, { "epoch": 0.09152, "grad_norm": 2.484375, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 4.8128, "loss/crossentropy": 2.4099135398864746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25587645173072815, "step": 4576 }, { "epoch": 0.09156, "grad_norm": 2.359375, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 4.5727, "loss/crossentropy": 1.7967870831489563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2521464377641678, "step": 4578 }, { "epoch": 0.0916, "grad_norm": 2.46875, "grad_norm_var": 0.0616851806640625, "learning_rate": 0.0001, "loss": 4.7604, "loss/crossentropy": 1.989583432674408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22892683744430542, "step": 4580 }, { "epoch": 0.09164, "grad_norm": 2.625, "grad_norm_var": 0.0626953125, "learning_rate": 0.0001, "loss": 4.4986, "loss/crossentropy": 2.167198657989502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24552703648805618, "step": 4582 }, { "epoch": 0.09168, "grad_norm": 2.453125, "grad_norm_var": 0.06129150390625, "learning_rate": 0.0001, "loss": 4.8237, "loss/crossentropy": 2.213137984275818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2614079788327217, "step": 4584 }, { "epoch": 0.09172, "grad_norm": 2.265625, "grad_norm_var": 0.06194661458333333, "learning_rate": 0.0001, "loss": 4.977, "loss/crossentropy": 2.3586392998695374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527218610048294, "step": 4586 }, { "epoch": 0.09176, "grad_norm": 2.53125, "grad_norm_var": 0.06054280598958333, "learning_rate": 0.0001, "loss": 4.5118, "loss/crossentropy": 2.326598286628723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2684827446937561, "step": 4588 }, { "epoch": 0.0918, "grad_norm": 2.234375, "grad_norm_var": 0.055562337239583336, "learning_rate": 0.0001, "loss": 4.4296, "loss/crossentropy": 2.1365907192230225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705482095479965, "step": 4590 }, { "epoch": 0.09184, "grad_norm": 2.328125, "grad_norm_var": 0.056005859375, "learning_rate": 0.0001, "loss": 4.6895, "loss/crossentropy": 1.816661775112152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188858687877655, "step": 4592 }, { "epoch": 0.09188, "grad_norm": 2.203125, "grad_norm_var": 0.020731608072916668, "learning_rate": 0.0001, "loss": 4.5953, "loss/crossentropy": 2.1533923149108887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2372177392244339, "step": 4594 }, { "epoch": 0.09192, "grad_norm": 2.984375, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 4.9472, "loss/crossentropy": 2.2175614833831787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27555912733078003, "step": 4596 }, { "epoch": 0.09196, "grad_norm": 2.640625, "grad_norm_var": 0.04644775390625, "learning_rate": 0.0001, "loss": 5.11, "loss/crossentropy": 2.583792209625244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2970864772796631, "step": 4598 }, { "epoch": 0.092, "grad_norm": 2.40625, "grad_norm_var": 0.046751912434895834, "learning_rate": 0.0001, "loss": 4.5761, "loss/crossentropy": 2.227868676185608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259210504591465, "step": 4600 }, { "epoch": 0.09204, "grad_norm": 2.265625, "grad_norm_var": 0.047118123372395834, "learning_rate": 0.0001, "loss": 4.4125, "loss/crossentropy": 2.146941900253296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26443855464458466, "step": 4602 }, { "epoch": 0.09208, "grad_norm": 2.296875, "grad_norm_var": 0.04439697265625, "learning_rate": 0.0001, "loss": 4.8655, "loss/crossentropy": 2.129795551300049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2451881766319275, "step": 4604 }, { "epoch": 0.09212, "grad_norm": 2.8125, "grad_norm_var": 0.0506256103515625, "learning_rate": 0.0001, "loss": 4.895, "loss/crossentropy": 2.2696213722229004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26037923991680145, "step": 4606 }, { "epoch": 0.09216, "grad_norm": 3.03125, "grad_norm_var": 0.07105712890625, "learning_rate": 0.0001, "loss": 4.7424, "loss/crossentropy": 2.1916056871414185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2648691013455391, "step": 4608 }, { "epoch": 0.0922, "grad_norm": 2.15625, "grad_norm_var": 0.07119038899739584, "learning_rate": 0.0001, "loss": 4.5021, "loss/crossentropy": 1.975761890411377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22248996049165726, "step": 4610 }, { "epoch": 0.09224, "grad_norm": 2.265625, "grad_norm_var": 0.053511555989583334, "learning_rate": 0.0001, "loss": 4.6596, "loss/crossentropy": 2.403375506401062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24553030729293823, "step": 4612 }, { "epoch": 0.09228, "grad_norm": 2.390625, "grad_norm_var": 0.049637858072916666, "learning_rate": 0.0001, "loss": 4.4542, "loss/crossentropy": 1.9468475580215454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24023501574993134, "step": 4614 }, { "epoch": 0.09232, "grad_norm": 2.40625, "grad_norm_var": 0.0534088134765625, "learning_rate": 0.0001, "loss": 4.8095, "loss/crossentropy": 2.013557195663452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23829826712608337, "step": 4616 }, { "epoch": 0.09236, "grad_norm": 2.265625, "grad_norm_var": 0.0519439697265625, "learning_rate": 0.0001, "loss": 4.5736, "loss/crossentropy": 2.0552549958229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2391991689801216, "step": 4618 }, { "epoch": 0.0924, "grad_norm": 2.28125, "grad_norm_var": 0.053694661458333334, "learning_rate": 0.0001, "loss": 4.4091, "loss/crossentropy": 1.8311110734939575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23821169883012772, "step": 4620 }, { "epoch": 0.09244, "grad_norm": 2.28125, "grad_norm_var": 0.04431966145833333, "learning_rate": 0.0001, "loss": 4.5034, "loss/crossentropy": 2.0346454977989197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23688799142837524, "step": 4622 }, { "epoch": 0.09248, "grad_norm": 2.484375, "grad_norm_var": 0.016304524739583333, "learning_rate": 0.0001, "loss": 4.5329, "loss/crossentropy": 1.8475716710090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23444947600364685, "step": 4624 }, { "epoch": 0.09252, "grad_norm": 2.328125, "grad_norm_var": 0.0136383056640625, "learning_rate": 0.0001, "loss": 4.4315, "loss/crossentropy": 2.075626790523529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2663211151957512, "step": 4626 }, { "epoch": 0.09256, "grad_norm": 2.46875, "grad_norm_var": 0.0130035400390625, "learning_rate": 0.0001, "loss": 4.9214, "loss/crossentropy": 2.413028359413147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29516373574733734, "step": 4628 }, { "epoch": 0.0926, "grad_norm": 2.53125, "grad_norm_var": 0.014143880208333333, "learning_rate": 0.0001, "loss": 4.6878, "loss/crossentropy": 1.8549358248710632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23275860399007797, "step": 4630 }, { "epoch": 0.09264, "grad_norm": 2.390625, "grad_norm_var": 0.0076405843098958336, "learning_rate": 0.0001, "loss": 4.8594, "loss/crossentropy": 2.273250460624695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2511187419295311, "step": 4632 }, { "epoch": 0.09268, "grad_norm": 2.4375, "grad_norm_var": 0.013093058268229167, "learning_rate": 0.0001, "loss": 4.336, "loss/crossentropy": 1.929758369922638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25462278723716736, "step": 4634 }, { "epoch": 0.09272, "grad_norm": 2.296875, "grad_norm_var": 0.013158162434895834, "learning_rate": 0.0001, "loss": 4.5888, "loss/crossentropy": 2.0929598212242126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23592843115329742, "step": 4636 }, { "epoch": 0.09276, "grad_norm": 2.21875, "grad_norm_var": 0.015282185872395833, "learning_rate": 0.0001, "loss": 4.4595, "loss/crossentropy": 1.973824143409729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24049720913171768, "step": 4638 }, { "epoch": 0.0928, "grad_norm": 2.25, "grad_norm_var": 0.014207967122395833, "learning_rate": 0.0001, "loss": 4.2606, "loss/crossentropy": 1.8606626987457275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21698658913373947, "step": 4640 }, { "epoch": 0.09284, "grad_norm": 2.5625, "grad_norm_var": 0.01875, "learning_rate": 0.0001, "loss": 4.9827, "loss/crossentropy": 2.241386890411377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2831972986459732, "step": 4642 }, { "epoch": 0.09288, "grad_norm": 2.34375, "grad_norm_var": 0.0166656494140625, "learning_rate": 0.0001, "loss": 4.7046, "loss/crossentropy": 2.2538920640945435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2482161968946457, "step": 4644 }, { "epoch": 0.09292, "grad_norm": 2.578125, "grad_norm_var": 0.01812744140625, "learning_rate": 0.0001, "loss": 4.8557, "loss/crossentropy": 2.067206382751465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26665763556957245, "step": 4646 }, { "epoch": 0.09296, "grad_norm": 2.5625, "grad_norm_var": 0.021751912434895833, "learning_rate": 0.0001, "loss": 4.9474, "loss/crossentropy": 2.33401358127594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2752760946750641, "step": 4648 }, { "epoch": 0.093, "grad_norm": 2.4375, "grad_norm_var": 0.04914449055989583, "learning_rate": 0.0001, "loss": 4.8658, "loss/crossentropy": 2.1510735750198364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24588150531053543, "step": 4650 }, { "epoch": 0.09304, "grad_norm": 2.28125, "grad_norm_var": 0.049397786458333336, "learning_rate": 0.0001, "loss": 4.579, "loss/crossentropy": 2.086448848247528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.261296346783638, "step": 4652 }, { "epoch": 0.09308, "grad_norm": 2.5625, "grad_norm_var": 0.0446685791015625, "learning_rate": 0.0001, "loss": 4.7495, "loss/crossentropy": 1.5951193571090698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312234491109848, "step": 4654 }, { "epoch": 0.09312, "grad_norm": 2.328125, "grad_norm_var": 0.04052734375, "learning_rate": 0.0001, "loss": 4.2273, "loss/crossentropy": 2.0098360776901245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22354383766651154, "step": 4656 }, { "epoch": 0.09316, "grad_norm": 2.375, "grad_norm_var": 0.04079488118489583, "learning_rate": 0.0001, "loss": 4.6047, "loss/crossentropy": 2.263219714164734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26900506019592285, "step": 4658 }, { "epoch": 0.0932, "grad_norm": 2.828125, "grad_norm_var": 0.0485504150390625, "learning_rate": 0.0001, "loss": 4.5575, "loss/crossentropy": 1.9481555819511414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24665354192256927, "step": 4660 }, { "epoch": 0.09324, "grad_norm": 2.265625, "grad_norm_var": 0.04988505045572917, "learning_rate": 0.0001, "loss": 4.5895, "loss/crossentropy": 1.8751367926597595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24356309324502945, "step": 4662 }, { "epoch": 0.09328, "grad_norm": 2.28125, "grad_norm_var": 0.06110026041666667, "learning_rate": 0.0001, "loss": 4.7739, "loss/crossentropy": 2.343896746635437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2558425962924957, "step": 4664 }, { "epoch": 0.09332, "grad_norm": 2.40625, "grad_norm_var": 0.032811482747395836, "learning_rate": 0.0001, "loss": 4.779, "loss/crossentropy": 2.067797303199768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25030098110437393, "step": 4666 }, { "epoch": 0.09336, "grad_norm": 2.359375, "grad_norm_var": 0.029816691080729166, "learning_rate": 0.0001, "loss": 4.7427, "loss/crossentropy": 2.4221161603927612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26934675872325897, "step": 4668 }, { "epoch": 0.0934, "grad_norm": 2.8125, "grad_norm_var": 0.038182576497395836, "learning_rate": 0.0001, "loss": 4.845, "loss/crossentropy": 2.2733672857284546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536270022392273, "step": 4670 }, { "epoch": 0.09344, "grad_norm": 2.390625, "grad_norm_var": 0.0369537353515625, "learning_rate": 0.0001, "loss": 4.7721, "loss/crossentropy": 2.179157257080078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24323320388793945, "step": 4672 }, { "epoch": 0.09348, "grad_norm": 2.578125, "grad_norm_var": 0.03875325520833333, "learning_rate": 0.0001, "loss": 4.8574, "loss/crossentropy": 2.3613970279693604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567686140537262, "step": 4674 }, { "epoch": 0.09352, "grad_norm": 2.5, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 4.787, "loss/crossentropy": 2.0755810141563416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349473536014557, "step": 4676 }, { "epoch": 0.09356, "grad_norm": 2.25, "grad_norm_var": 0.03134358723958333, "learning_rate": 0.0001, "loss": 4.6235, "loss/crossentropy": 1.9971619248390198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22000454366207123, "step": 4678 }, { "epoch": 0.0936, "grad_norm": 2.3125, "grad_norm_var": 0.020068359375, "learning_rate": 0.0001, "loss": 4.5307, "loss/crossentropy": 1.9419977068901062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21052303910255432, "step": 4680 }, { "epoch": 0.09364, "grad_norm": 2.3125, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 4.6314, "loss/crossentropy": 2.1175334453582764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23567666858434677, "step": 4682 }, { "epoch": 0.09368, "grad_norm": 2.40625, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 4.5955, "loss/crossentropy": 2.0300605297088623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29177258908748627, "step": 4684 }, { "epoch": 0.09372, "grad_norm": 2.296875, "grad_norm_var": 0.009228515625, "learning_rate": 0.0001, "loss": 4.5583, "loss/crossentropy": 2.052473723888397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22480525821447372, "step": 4686 }, { "epoch": 0.09376, "grad_norm": 2.359375, "grad_norm_var": 0.008275349934895834, "learning_rate": 0.0001, "loss": 4.4925, "loss/crossentropy": 2.2506834268569946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26876674592494965, "step": 4688 }, { "epoch": 0.0938, "grad_norm": 2.125, "grad_norm_var": 0.007515462239583334, "learning_rate": 0.0001, "loss": 4.291, "loss/crossentropy": 1.930393099784851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22246946394443512, "step": 4690 }, { "epoch": 0.09384, "grad_norm": 2.5, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.9317, "loss/crossentropy": 2.5477651357650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27808643877506256, "step": 4692 }, { "epoch": 0.09388, "grad_norm": 2.15625, "grad_norm_var": 0.009033203125, "learning_rate": 0.0001, "loss": 4.5313, "loss/crossentropy": 2.1059221625328064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.253792941570282, "step": 4694 }, { "epoch": 0.09392, "grad_norm": 2.265625, "grad_norm_var": 0.0091217041015625, "learning_rate": 0.0001, "loss": 4.607, "loss/crossentropy": 2.0058358907699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23942459374666214, "step": 4696 }, { "epoch": 0.09396, "grad_norm": 2.25, "grad_norm_var": 0.0103515625, "learning_rate": 0.0001, "loss": 4.2024, "loss/crossentropy": 1.9118947982788086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2300974577665329, "step": 4698 }, { "epoch": 0.094, "grad_norm": 2.5625, "grad_norm_var": 0.013337198893229167, "learning_rate": 0.0001, "loss": 4.6541, "loss/crossentropy": 2.1172733902931213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24785596132278442, "step": 4700 }, { "epoch": 0.09404, "grad_norm": 2.5, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 4.9005, "loss/crossentropy": 2.0064170956611633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25045372545719147, "step": 4702 }, { "epoch": 0.09408, "grad_norm": 2.3125, "grad_norm_var": 0.015738932291666667, "learning_rate": 0.0001, "loss": 4.8139, "loss/crossentropy": 2.5816656351089478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27938568592071533, "step": 4704 }, { "epoch": 0.09412, "grad_norm": 2.25, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 4.866, "loss/crossentropy": 2.5768171548843384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27975398302078247, "step": 4706 }, { "epoch": 0.09416, "grad_norm": 2.234375, "grad_norm_var": 0.0153717041015625, "learning_rate": 0.0001, "loss": 4.6075, "loss/crossentropy": 2.323893189430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2608431279659271, "step": 4708 }, { "epoch": 0.0942, "grad_norm": 2.265625, "grad_norm_var": 0.01357421875, "learning_rate": 0.0001, "loss": 4.3564, "loss/crossentropy": 1.7910810708999634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21380099654197693, "step": 4710 }, { "epoch": 0.09424, "grad_norm": 2.21875, "grad_norm_var": 0.015848795572916668, "learning_rate": 0.0001, "loss": 4.6374, "loss/crossentropy": 2.214052677154541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24833911657333374, "step": 4712 }, { "epoch": 0.09428, "grad_norm": 2.40625, "grad_norm_var": 0.0158355712890625, "learning_rate": 0.0001, "loss": 4.9987, "loss/crossentropy": 2.0850380063056946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419341504573822, "step": 4714 }, { "epoch": 0.09432, "grad_norm": 2.359375, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 4.2961, "loss/crossentropy": 2.0707927346229553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2499033510684967, "step": 4716 }, { "epoch": 0.09436, "grad_norm": 2.3125, "grad_norm_var": 0.0125396728515625, "learning_rate": 0.0001, "loss": 4.4057, "loss/crossentropy": 2.11221444606781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25916673243045807, "step": 4718 }, { "epoch": 0.0944, "grad_norm": 2.390625, "grad_norm_var": 0.012718709309895833, "learning_rate": 0.0001, "loss": 4.6057, "loss/crossentropy": 1.8400230407714844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22771906107664108, "step": 4720 }, { "epoch": 0.09444, "grad_norm": 2.28125, "grad_norm_var": 0.008234659830729166, "learning_rate": 0.0001, "loss": 4.4929, "loss/crossentropy": 2.2271865606307983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2574647441506386, "step": 4722 }, { "epoch": 0.09448, "grad_norm": 2.203125, "grad_norm_var": 0.008649698893229167, "learning_rate": 0.0001, "loss": 4.4629, "loss/crossentropy": 2.4532920122146606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27140843868255615, "step": 4724 }, { "epoch": 0.09452, "grad_norm": 2.375, "grad_norm_var": 0.009504191080729167, "learning_rate": 0.0001, "loss": 4.731, "loss/crossentropy": 1.951303780078888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333502620458603, "step": 4726 }, { "epoch": 0.09456, "grad_norm": 2.40625, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 4.6334, "loss/crossentropy": 2.209821343421936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24776015430688858, "step": 4728 }, { "epoch": 0.0946, "grad_norm": 2.3125, "grad_norm_var": 0.010872395833333333, "learning_rate": 0.0001, "loss": 4.861, "loss/crossentropy": 2.434941530227661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2747645229101181, "step": 4730 }, { "epoch": 0.09464, "grad_norm": 2.1875, "grad_norm_var": 0.010602823893229167, "learning_rate": 0.0001, "loss": 4.6145, "loss/crossentropy": 2.051652252674103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24515582621097565, "step": 4732 }, { "epoch": 0.09468, "grad_norm": 2.265625, "grad_norm_var": 0.009797159830729167, "learning_rate": 0.0001, "loss": 4.6804, "loss/crossentropy": 2.0039377212524414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23073314130306244, "step": 4734 }, { "epoch": 0.09472, "grad_norm": 2.4375, "grad_norm_var": 0.19903971354166666, "learning_rate": 0.0001, "loss": 4.8333, "loss/crossentropy": 2.166410982608795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25871724635362625, "step": 4736 }, { "epoch": 0.09476, "grad_norm": 2.3125, "grad_norm_var": 0.2066802978515625, "learning_rate": 0.0001, "loss": 4.2739, "loss/crossentropy": 1.9289153218269348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23183659464120865, "step": 4738 }, { "epoch": 0.0948, "grad_norm": 2.46875, "grad_norm_var": 0.20640360514322917, "learning_rate": 0.0001, "loss": 4.5903, "loss/crossentropy": 2.270771861076355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26335832476615906, "step": 4740 }, { "epoch": 0.09484, "grad_norm": 2.4375, "grad_norm_var": 0.206494140625, "learning_rate": 0.0001, "loss": 4.7142, "loss/crossentropy": 2.395651936531067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28527122735977173, "step": 4742 }, { "epoch": 0.09488, "grad_norm": 2.390625, "grad_norm_var": 0.20437723795572918, "learning_rate": 0.0001, "loss": 4.5083, "loss/crossentropy": 1.8597867488861084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224149189889431, "step": 4744 }, { "epoch": 0.09492, "grad_norm": 2.265625, "grad_norm_var": 0.20693257649739583, "learning_rate": 0.0001, "loss": 4.4415, "loss/crossentropy": 1.7795116305351257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23233074694871902, "step": 4746 }, { "epoch": 0.09496, "grad_norm": 2.4375, "grad_norm_var": 0.20255533854166666, "learning_rate": 0.0001, "loss": 4.7749, "loss/crossentropy": 1.9449282884597778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2524118423461914, "step": 4748 }, { "epoch": 0.095, "grad_norm": 2.53125, "grad_norm_var": 0.2005035400390625, "learning_rate": 0.0001, "loss": 4.7047, "loss/crossentropy": 2.195169448852539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349853590130806, "step": 4750 }, { "epoch": 0.09504, "grad_norm": 2.671875, "grad_norm_var": 0.022468058268229167, "learning_rate": 0.0001, "loss": 4.5268, "loss/crossentropy": 1.6628928184509277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975274682044983, "step": 4752 }, { "epoch": 0.09508, "grad_norm": 2.25, "grad_norm_var": 0.018488566080729168, "learning_rate": 0.0001, "loss": 4.5862, "loss/crossentropy": 2.0991236567497253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23341374844312668, "step": 4754 }, { "epoch": 0.09512, "grad_norm": 2.40625, "grad_norm_var": 0.017024739583333334, "learning_rate": 0.0001, "loss": 4.8338, "loss/crossentropy": 2.350286066532135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2648291736841202, "step": 4756 }, { "epoch": 0.09516, "grad_norm": 2.375, "grad_norm_var": 0.017430623372395832, "learning_rate": 0.0001, "loss": 4.5788, "loss/crossentropy": 2.02384877204895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24653150886297226, "step": 4758 }, { "epoch": 0.0952, "grad_norm": 2.203125, "grad_norm_var": 0.01734619140625, "learning_rate": 0.0001, "loss": 4.7052, "loss/crossentropy": 2.264349341392517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25881223380565643, "step": 4760 }, { "epoch": 0.09524, "grad_norm": 2.890625, "grad_norm_var": 0.03779296875, "learning_rate": 0.0001, "loss": 4.8216, "loss/crossentropy": 2.197356939315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565019279718399, "step": 4762 }, { "epoch": 0.09528, "grad_norm": 2.515625, "grad_norm_var": 0.038655598958333336, "learning_rate": 0.0001, "loss": 4.3849, "loss/crossentropy": 1.8794787526130676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21771979331970215, "step": 4764 }, { "epoch": 0.09532, "grad_norm": 2.3125, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 4.7391, "loss/crossentropy": 2.107520580291748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2350333333015442, "step": 4766 }, { "epoch": 0.09536, "grad_norm": 2.3125, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 4.5194, "loss/crossentropy": 2.082249402999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24860269576311111, "step": 4768 }, { "epoch": 0.0954, "grad_norm": 2.296875, "grad_norm_var": 0.03062744140625, "learning_rate": 0.0001, "loss": 4.3985, "loss/crossentropy": 1.9632289409637451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24099770188331604, "step": 4770 }, { "epoch": 0.09544, "grad_norm": 2.359375, "grad_norm_var": 0.02994384765625, "learning_rate": 0.0001, "loss": 4.4526, "loss/crossentropy": 2.3403327465057373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2573448717594147, "step": 4772 }, { "epoch": 0.09548, "grad_norm": 2.40625, "grad_norm_var": 0.03181050618489583, "learning_rate": 0.0001, "loss": 4.7711, "loss/crossentropy": 2.302455425262451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28050975501537323, "step": 4774 }, { "epoch": 0.09552, "grad_norm": 2.28125, "grad_norm_var": 0.031103515625, "learning_rate": 0.0001, "loss": 4.6092, "loss/crossentropy": 2.2875213623046875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23910009860992432, "step": 4776 }, { "epoch": 0.09556, "grad_norm": 2.765625, "grad_norm_var": 0.022721354166666666, "learning_rate": 0.0001, "loss": 4.5237, "loss/crossentropy": 2.0440263748168945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24640783667564392, "step": 4778 }, { "epoch": 0.0956, "grad_norm": 2.53125, "grad_norm_var": 0.02613525390625, "learning_rate": 0.0001, "loss": 4.6328, "loss/crossentropy": 2.1597142219543457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24566050618886948, "step": 4780 }, { "epoch": 0.09564, "grad_norm": 2.203125, "grad_norm_var": 0.027765909830729168, "learning_rate": 0.0001, "loss": 4.7138, "loss/crossentropy": 2.2846235036849976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598261833190918, "step": 4782 }, { "epoch": 0.09568, "grad_norm": 2.1875, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 4.4527, "loss/crossentropy": 2.032243251800537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2488306611776352, "step": 4784 }, { "epoch": 0.09572, "grad_norm": 2.5, "grad_norm_var": 0.037230428059895834, "learning_rate": 0.0001, "loss": 4.7651, "loss/crossentropy": 2.4999172687530518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25949685275554657, "step": 4786 }, { "epoch": 0.09576, "grad_norm": 2.296875, "grad_norm_var": 0.0375152587890625, "learning_rate": 0.0001, "loss": 4.6068, "loss/crossentropy": 2.1610575914382935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2414923906326294, "step": 4788 }, { "epoch": 0.0958, "grad_norm": 2.390625, "grad_norm_var": 0.03135477701822917, "learning_rate": 0.0001, "loss": 4.5309, "loss/crossentropy": 1.8679919838905334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21853071451187134, "step": 4790 }, { "epoch": 0.09584, "grad_norm": 2.390625, "grad_norm_var": 0.031281534830729166, "learning_rate": 0.0001, "loss": 4.697, "loss/crossentropy": 2.072615623474121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24943216145038605, "step": 4792 }, { "epoch": 0.09588, "grad_norm": 2.765625, "grad_norm_var": 0.032624308268229166, "learning_rate": 0.0001, "loss": 4.5555, "loss/crossentropy": 2.420115113258362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2840816229581833, "step": 4794 }, { "epoch": 0.09592, "grad_norm": 2.6875, "grad_norm_var": 0.0455078125, "learning_rate": 0.0001, "loss": 5.102, "loss/crossentropy": 2.2809360027313232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2739071249961853, "step": 4796 }, { "epoch": 0.09596, "grad_norm": 2.328125, "grad_norm_var": 0.04319559733072917, "learning_rate": 0.0001, "loss": 4.5418, "loss/crossentropy": 2.1809465289115906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24680403620004654, "step": 4798 }, { "epoch": 0.096, "grad_norm": 2.15625, "grad_norm_var": 0.044611612955729164, "learning_rate": 0.0001, "loss": 4.6994, "loss/crossentropy": 1.9710316061973572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598233222961426, "step": 4800 }, { "epoch": 0.09604, "grad_norm": 2.40625, "grad_norm_var": 0.035477701822916666, "learning_rate": 0.0001, "loss": 4.9811, "loss/crossentropy": 2.4378572702407837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2665309011936188, "step": 4802 }, { "epoch": 0.09608, "grad_norm": 2.390625, "grad_norm_var": 0.03759358723958333, "learning_rate": 0.0001, "loss": 4.604, "loss/crossentropy": 1.8578800559043884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24051420390605927, "step": 4804 }, { "epoch": 0.09612, "grad_norm": 2.40625, "grad_norm_var": 0.034520467122395836, "learning_rate": 0.0001, "loss": 4.4567, "loss/crossentropy": 2.191601037979126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25017624348402023, "step": 4806 }, { "epoch": 0.09616, "grad_norm": 2.296875, "grad_norm_var": 0.038374837239583334, "learning_rate": 0.0001, "loss": 4.5777, "loss/crossentropy": 2.2077550888061523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31426818668842316, "step": 4808 }, { "epoch": 0.0962, "grad_norm": 2.390625, "grad_norm_var": 0.030659993489583332, "learning_rate": 0.0001, "loss": 4.917, "loss/crossentropy": 2.2983503341674805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26998060941696167, "step": 4810 }, { "epoch": 0.09624, "grad_norm": 2.703125, "grad_norm_var": 0.020824178059895834, "learning_rate": 0.0001, "loss": 4.8691, "loss/crossentropy": 2.0875505208969116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2673826217651367, "step": 4812 }, { "epoch": 0.09628, "grad_norm": 2.03125, "grad_norm_var": 0.031403605143229166, "learning_rate": 0.0001, "loss": 4.0736, "loss/crossentropy": 1.9439318776130676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2379719465970993, "step": 4814 }, { "epoch": 0.09632, "grad_norm": 2.359375, "grad_norm_var": 0.03611551920572917, "learning_rate": 0.0001, "loss": 4.4698, "loss/crossentropy": 1.9822518229484558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24425340443849564, "step": 4816 }, { "epoch": 0.09636, "grad_norm": 2.296875, "grad_norm_var": 0.03902994791666667, "learning_rate": 0.0001, "loss": 4.475, "loss/crossentropy": 2.0238336324691772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2409110590815544, "step": 4818 }, { "epoch": 0.0964, "grad_norm": 2.34375, "grad_norm_var": 0.03707275390625, "learning_rate": 0.0001, "loss": 4.5401, "loss/crossentropy": 2.454450249671936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26127950847148895, "step": 4820 }, { "epoch": 0.09644, "grad_norm": 2.296875, "grad_norm_var": 0.038407389322916666, "learning_rate": 0.0001, "loss": 4.5682, "loss/crossentropy": 2.210579752922058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23662539571523666, "step": 4822 }, { "epoch": 0.09648, "grad_norm": 2.28125, "grad_norm_var": 0.034601847330729164, "learning_rate": 0.0001, "loss": 4.5527, "loss/crossentropy": 1.8737664222717285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22884630411863327, "step": 4824 }, { "epoch": 0.09652, "grad_norm": 2.484375, "grad_norm_var": 0.0335357666015625, "learning_rate": 0.0001, "loss": 4.6221, "loss/crossentropy": 2.1939562559127808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25145241618156433, "step": 4826 }, { "epoch": 0.09656, "grad_norm": 2.328125, "grad_norm_var": 0.0278472900390625, "learning_rate": 0.0001, "loss": 4.6685, "loss/crossentropy": 2.133625030517578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25248992443084717, "step": 4828 }, { "epoch": 0.0966, "grad_norm": 2.328125, "grad_norm_var": 0.0197906494140625, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.259738326072693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24727293848991394, "step": 4830 }, { "epoch": 0.09664, "grad_norm": 2.984375, "grad_norm_var": 0.043635050455729164, "learning_rate": 0.0001, "loss": 4.7497, "loss/crossentropy": 2.0186268091201782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27940231561660767, "step": 4832 }, { "epoch": 0.09668, "grad_norm": 2.234375, "grad_norm_var": 0.04185282389322917, "learning_rate": 0.0001, "loss": 4.5867, "loss/crossentropy": 1.9686395525932312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23021705448627472, "step": 4834 }, { "epoch": 0.09672, "grad_norm": 2.296875, "grad_norm_var": 0.0424224853515625, "learning_rate": 0.0001, "loss": 4.6157, "loss/crossentropy": 2.1677820682525635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2766892910003662, "step": 4836 }, { "epoch": 0.09676, "grad_norm": 2.1875, "grad_norm_var": 0.04670308430989583, "learning_rate": 0.0001, "loss": 4.3751, "loss/crossentropy": 1.8458788990974426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23294126987457275, "step": 4838 }, { "epoch": 0.0968, "grad_norm": 2.640625, "grad_norm_var": 0.0495269775390625, "learning_rate": 0.0001, "loss": 4.8931, "loss/crossentropy": 1.7898097038269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23436476290225983, "step": 4840 }, { "epoch": 0.09684, "grad_norm": 2.375, "grad_norm_var": 0.04843343098958333, "learning_rate": 0.0001, "loss": 4.5169, "loss/crossentropy": 2.4594497680664062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2618473023176193, "step": 4842 }, { "epoch": 0.09688, "grad_norm": 2.4375, "grad_norm_var": 0.04752197265625, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 2.061558425426483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2523345798254013, "step": 4844 }, { "epoch": 0.09692, "grad_norm": 2.484375, "grad_norm_var": 0.04462890625, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.060658574104309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2493506520986557, "step": 4846 }, { "epoch": 0.09696, "grad_norm": 2.25, "grad_norm_var": 0.016169230143229168, "learning_rate": 0.0001, "loss": 4.8028, "loss/crossentropy": 2.201029062271118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2629493921995163, "step": 4848 }, { "epoch": 0.097, "grad_norm": 2.34375, "grad_norm_var": 0.01519775390625, "learning_rate": 0.0001, "loss": 4.7404, "loss/crossentropy": 2.199273705482483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27054519951343536, "step": 4850 }, { "epoch": 0.09704, "grad_norm": 2.109375, "grad_norm_var": 0.0189453125, "learning_rate": 0.0001, "loss": 4.3008, "loss/crossentropy": 2.2466784715652466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2678111642599106, "step": 4852 }, { "epoch": 0.09708, "grad_norm": 2.71875, "grad_norm_var": 0.022054036458333332, "learning_rate": 0.0001, "loss": 4.6762, "loss/crossentropy": 1.9152815341949463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2441672906279564, "step": 4854 }, { "epoch": 0.09712, "grad_norm": 2.21875, "grad_norm_var": 0.017577107747395834, "learning_rate": 0.0001, "loss": 4.3818, "loss/crossentropy": 2.092438578605652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24061349034309387, "step": 4856 }, { "epoch": 0.09716, "grad_norm": 2.4375, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 4.3224, "loss/crossentropy": 1.8620384335517883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23738765716552734, "step": 4858 }, { "epoch": 0.0972, "grad_norm": 2.953125, "grad_norm_var": 0.04138997395833333, "learning_rate": 0.0001, "loss": 4.8798, "loss/crossentropy": 2.199701428413391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26521213352680206, "step": 4860 }, { "epoch": 0.09724, "grad_norm": 2.53125, "grad_norm_var": 0.042170206705729164, "learning_rate": 0.0001, "loss": 4.7598, "loss/crossentropy": 2.3637821674346924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2707534506917, "step": 4862 }, { "epoch": 0.09728, "grad_norm": 2.21875, "grad_norm_var": 0.045491536458333336, "learning_rate": 0.0001, "loss": 4.5184, "loss/crossentropy": 2.207235813140869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28080131858587265, "step": 4864 }, { "epoch": 0.09732, "grad_norm": 2.265625, "grad_norm_var": 0.0466461181640625, "learning_rate": 0.0001, "loss": 4.3798, "loss/crossentropy": 1.901595950126648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23459318280220032, "step": 4866 }, { "epoch": 0.09736, "grad_norm": 2.3125, "grad_norm_var": 0.042769368489583334, "learning_rate": 0.0001, "loss": 4.4289, "loss/crossentropy": 2.298312544822693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2580094337463379, "step": 4868 }, { "epoch": 0.0974, "grad_norm": 2.3125, "grad_norm_var": 0.03658854166666667, "learning_rate": 0.0001, "loss": 4.7576, "loss/crossentropy": 2.29680597782135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24749226868152618, "step": 4870 }, { "epoch": 0.09744, "grad_norm": 2.25, "grad_norm_var": 0.0361968994140625, "learning_rate": 0.0001, "loss": 4.5839, "loss/crossentropy": 2.1169378757476807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565944790840149, "step": 4872 }, { "epoch": 0.09748, "grad_norm": 2.140625, "grad_norm_var": 0.04035542805989583, "learning_rate": 0.0001, "loss": 4.5364, "loss/crossentropy": 2.0863184928894043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527791038155556, "step": 4874 }, { "epoch": 0.09752, "grad_norm": 2.296875, "grad_norm_var": 0.015625, "learning_rate": 0.0001, "loss": 4.6176, "loss/crossentropy": 2.146193563938141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23968011140823364, "step": 4876 }, { "epoch": 0.09756, "grad_norm": 2.375, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.5598, "loss/crossentropy": 2.1125508546829224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2498009204864502, "step": 4878 }, { "epoch": 0.0976, "grad_norm": 2.375, "grad_norm_var": 0.004130045572916667, "learning_rate": 0.0001, "loss": 4.772, "loss/crossentropy": 2.13166081905365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2640175372362137, "step": 4880 }, { "epoch": 0.09764, "grad_norm": 2.15625, "grad_norm_var": 0.0051910400390625, "learning_rate": 0.0001, "loss": 4.5276, "loss/crossentropy": 2.047860622406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980721086263657, "step": 4882 }, { "epoch": 0.09768, "grad_norm": 2.515625, "grad_norm_var": 0.010477701822916666, "learning_rate": 0.0001, "loss": 4.8519, "loss/crossentropy": 2.362569808959961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27908293902873993, "step": 4884 }, { "epoch": 0.09772, "grad_norm": 2.109375, "grad_norm_var": 0.012398274739583333, "learning_rate": 0.0001, "loss": 4.2709, "loss/crossentropy": 1.8781500458717346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22341010719537735, "step": 4886 }, { "epoch": 0.09776, "grad_norm": 2.390625, "grad_norm_var": 0.019807942708333335, "learning_rate": 0.0001, "loss": 4.9408, "loss/crossentropy": 2.1138893365859985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2533891350030899, "step": 4888 }, { "epoch": 0.0978, "grad_norm": 2.484375, "grad_norm_var": 0.0188140869140625, "learning_rate": 0.0001, "loss": 4.5981, "loss/crossentropy": 2.2212090492248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.279159352183342, "step": 4890 }, { "epoch": 0.09784, "grad_norm": 2.453125, "grad_norm_var": 0.0177886962890625, "learning_rate": 0.0001, "loss": 4.7245, "loss/crossentropy": 1.9747707843780518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24078013002872467, "step": 4892 }, { "epoch": 0.09788, "grad_norm": 2.53125, "grad_norm_var": 0.020015462239583334, "learning_rate": 0.0001, "loss": 4.4051, "loss/crossentropy": 1.8607316613197327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25342857837677, "step": 4894 }, { "epoch": 0.09792, "grad_norm": 2.359375, "grad_norm_var": 0.020963541666666665, "learning_rate": 0.0001, "loss": 4.6759, "loss/crossentropy": 2.245271682739258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24260863661766052, "step": 4896 }, { "epoch": 0.09796, "grad_norm": 2.390625, "grad_norm_var": 0.017560831705729165, "learning_rate": 0.0001, "loss": 4.5006, "loss/crossentropy": 2.0503702759742737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25429168343544006, "step": 4898 }, { "epoch": 0.098, "grad_norm": 2.3125, "grad_norm_var": 0.017236328125, "learning_rate": 0.0001, "loss": 4.421, "loss/crossentropy": 1.7784077525138855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2249627709388733, "step": 4900 }, { "epoch": 0.09804, "grad_norm": 2.390625, "grad_norm_var": 0.0114898681640625, "learning_rate": 0.0001, "loss": 4.5398, "loss/crossentropy": 1.9827336072921753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001181542873383, "step": 4902 }, { "epoch": 0.09808, "grad_norm": 2.53125, "grad_norm_var": 1.638996378580729, "learning_rate": 0.0001, "loss": 4.8149, "loss/crossentropy": 2.1003565788269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2550586014986038, "step": 4904 }, { "epoch": 0.09812, "grad_norm": 2.484375, "grad_norm_var": 1.6366933186848958, "learning_rate": 0.0001, "loss": 4.5099, "loss/crossentropy": 1.9565055966377258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25481177121400833, "step": 4906 }, { "epoch": 0.09816, "grad_norm": 2.234375, "grad_norm_var": 1.6479777018229167, "learning_rate": 0.0001, "loss": 4.6214, "loss/crossentropy": 2.1693456172943115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23336851596832275, "step": 4908 }, { "epoch": 0.0982, "grad_norm": 2.40625, "grad_norm_var": 1.6428995768229167, "learning_rate": 0.0001, "loss": 4.7637, "loss/crossentropy": 2.050541341304779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25821831077337265, "step": 4910 }, { "epoch": 0.09824, "grad_norm": 2.40625, "grad_norm_var": 1.62906494140625, "learning_rate": 0.0001, "loss": 4.8515, "loss/crossentropy": 2.168497681617737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2674206495285034, "step": 4912 }, { "epoch": 0.09828, "grad_norm": 2.609375, "grad_norm_var": 1.6097320556640624, "learning_rate": 0.0001, "loss": 5.1667, "loss/crossentropy": 2.147577404975891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26900771260261536, "step": 4914 }, { "epoch": 0.09832, "grad_norm": 2.09375, "grad_norm_var": 1.6371734619140625, "learning_rate": 0.0001, "loss": 4.3958, "loss/crossentropy": 2.4436198472976685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2504816800355911, "step": 4916 }, { "epoch": 0.09836, "grad_norm": 2.25, "grad_norm_var": 1.6447265625, "learning_rate": 0.0001, "loss": 4.7244, "loss/crossentropy": 2.097359538078308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24005448818206787, "step": 4918 }, { "epoch": 0.0984, "grad_norm": 2.078125, "grad_norm_var": 0.0267974853515625, "learning_rate": 0.0001, "loss": 4.2305, "loss/crossentropy": 1.8467384576797485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22932368516921997, "step": 4920 }, { "epoch": 0.09844, "grad_norm": 2.859375, "grad_norm_var": 0.04263916015625, "learning_rate": 0.0001, "loss": 4.4407, "loss/crossentropy": 2.1454135179519653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2634875178337097, "step": 4922 }, { "epoch": 0.09848, "grad_norm": 2.765625, "grad_norm_var": 0.05164286295572917, "learning_rate": 0.0001, "loss": 4.5775, "loss/crossentropy": 1.9837967157363892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980768024921417, "step": 4924 }, { "epoch": 0.09852, "grad_norm": 2.265625, "grad_norm_var": 0.054032389322916666, "learning_rate": 0.0001, "loss": 4.6888, "loss/crossentropy": 2.099667489528656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23482084274291992, "step": 4926 }, { "epoch": 0.09856, "grad_norm": 3.125, "grad_norm_var": 0.08787333170572917, "learning_rate": 0.0001, "loss": 4.5685, "loss/crossentropy": 1.8467332124710083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.256004735827446, "step": 4928 }, { "epoch": 0.0986, "grad_norm": 2.53125, "grad_norm_var": 0.08548075358072917, "learning_rate": 0.0001, "loss": 4.4474, "loss/crossentropy": 2.036003887653351, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2377806007862091, "step": 4930 }, { "epoch": 0.09864, "grad_norm": 2.484375, "grad_norm_var": 0.07834370930989583, "learning_rate": 0.0001, "loss": 4.8258, "loss/crossentropy": 2.091457724571228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24746537953615189, "step": 4932 }, { "epoch": 0.09868, "grad_norm": 2.296875, "grad_norm_var": 0.07929280598958334, "learning_rate": 0.0001, "loss": 4.6583, "loss/crossentropy": 2.245227336883545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24141517281532288, "step": 4934 }, { "epoch": 0.09872, "grad_norm": 2.40625, "grad_norm_var": 0.0661285400390625, "learning_rate": 0.0001, "loss": 4.6514, "loss/crossentropy": 2.295682668685913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28035247325897217, "step": 4936 }, { "epoch": 0.09876, "grad_norm": 2.234375, "grad_norm_var": 0.05524800618489583, "learning_rate": 0.0001, "loss": 4.5525, "loss/crossentropy": 1.9251704812049866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23609354346990585, "step": 4938 }, { "epoch": 0.0988, "grad_norm": 2.5, "grad_norm_var": 0.048314412434895836, "learning_rate": 0.0001, "loss": 5.1583, "loss/crossentropy": 2.289652466773987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30698196589946747, "step": 4940 }, { "epoch": 0.09884, "grad_norm": 2.09375, "grad_norm_var": 0.0534820556640625, "learning_rate": 0.0001, "loss": 4.7759, "loss/crossentropy": 2.3339043855667114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2449272722005844, "step": 4942 }, { "epoch": 0.09888, "grad_norm": 2.203125, "grad_norm_var": 0.01949462890625, "learning_rate": 0.0001, "loss": 4.6678, "loss/crossentropy": 2.0610195994377136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2553517669439316, "step": 4944 }, { "epoch": 0.09892, "grad_norm": 2.203125, "grad_norm_var": 0.018122355143229168, "learning_rate": 0.0001, "loss": 4.5849, "loss/crossentropy": 2.1093825101852417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24767974764108658, "step": 4946 }, { "epoch": 0.09896, "grad_norm": 2.328125, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 4.3631, "loss/crossentropy": 2.1742878556251526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24640457332134247, "step": 4948 }, { "epoch": 0.099, "grad_norm": 2.390625, "grad_norm_var": 0.0144195556640625, "learning_rate": 0.0001, "loss": 4.8319, "loss/crossentropy": 2.4237486124038696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2610916793346405, "step": 4950 }, { "epoch": 0.09904, "grad_norm": 2.15625, "grad_norm_var": 0.015445963541666666, "learning_rate": 0.0001, "loss": 4.3397, "loss/crossentropy": 2.197754144668579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275979220867157, "step": 4952 }, { "epoch": 0.09908, "grad_norm": 2.21875, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 4.4533, "loss/crossentropy": 2.267225503921509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23391032963991165, "step": 4954 }, { "epoch": 0.09912, "grad_norm": 2.265625, "grad_norm_var": 0.009504191080729167, "learning_rate": 0.0001, "loss": 4.47, "loss/crossentropy": 2.04893159866333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23763196170330048, "step": 4956 }, { "epoch": 0.09916, "grad_norm": 2.4375, "grad_norm_var": 0.010716756184895834, "learning_rate": 0.0001, "loss": 4.97, "loss/crossentropy": 2.4489223957061768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27698180079460144, "step": 4958 }, { "epoch": 0.0992, "grad_norm": 2.328125, "grad_norm_var": 0.011930338541666667, "learning_rate": 0.0001, "loss": 4.8604, "loss/crossentropy": 2.3654375076293945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495984137058258, "step": 4960 }, { "epoch": 0.09924, "grad_norm": 2.328125, "grad_norm_var": 0.011812337239583333, "learning_rate": 0.0001, "loss": 4.7671, "loss/crossentropy": 1.8583308458328247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21832667291164398, "step": 4962 }, { "epoch": 0.09928, "grad_norm": 2.265625, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 4.4951, "loss/crossentropy": 2.0762425661087036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280900850892067, "step": 4964 }, { "epoch": 0.09932, "grad_norm": 2.15625, "grad_norm_var": 0.012532552083333334, "learning_rate": 0.0001, "loss": 4.7461, "loss/crossentropy": 2.228640913963318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26595622301101685, "step": 4966 }, { "epoch": 0.09936, "grad_norm": 2.234375, "grad_norm_var": 0.0111236572265625, "learning_rate": 0.0001, "loss": 4.521, "loss/crossentropy": 2.1447466611862183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542327791452408, "step": 4968 }, { "epoch": 0.0994, "grad_norm": 2.09375, "grad_norm_var": 0.01226806640625, "learning_rate": 0.0001, "loss": 4.3959, "loss/crossentropy": 2.0750887989997864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2362435683608055, "step": 4970 }, { "epoch": 0.09944, "grad_norm": 2.328125, "grad_norm_var": 0.009666951497395833, "learning_rate": 0.0001, "loss": 4.6705, "loss/crossentropy": 1.9413353204727173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24418477714061737, "step": 4972 }, { "epoch": 0.09948, "grad_norm": 2.265625, "grad_norm_var": 0.00904541015625, "learning_rate": 0.0001, "loss": 4.609, "loss/crossentropy": 2.102766752243042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24170882254838943, "step": 4974 }, { "epoch": 0.09952, "grad_norm": 2.359375, "grad_norm_var": 0.0090240478515625, "learning_rate": 0.0001, "loss": 4.7568, "loss/crossentropy": 2.431061267852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2855495512485504, "step": 4976 }, { "epoch": 0.09956, "grad_norm": 2.453125, "grad_norm_var": 0.010448201497395834, "learning_rate": 0.0001, "loss": 4.7449, "loss/crossentropy": 1.9656312465667725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26410341262817383, "step": 4978 }, { "epoch": 0.0996, "grad_norm": 2.578125, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 5.0286, "loss/crossentropy": 2.2365923523902893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27946531772613525, "step": 4980 }, { "epoch": 0.09964, "grad_norm": 2.4375, "grad_norm_var": 0.015184529622395833, "learning_rate": 0.0001, "loss": 4.5791, "loss/crossentropy": 2.203595757484436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25247688591480255, "step": 4982 }, { "epoch": 0.09968, "grad_norm": 2.296875, "grad_norm_var": 0.0163482666015625, "learning_rate": 0.0001, "loss": 4.6824, "loss/crossentropy": 2.1462446451187134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2539386674761772, "step": 4984 }, { "epoch": 0.09972, "grad_norm": 2.25, "grad_norm_var": 0.014127604166666667, "learning_rate": 0.0001, "loss": 4.7121, "loss/crossentropy": 2.4518587589263916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24366125464439392, "step": 4986 }, { "epoch": 0.09976, "grad_norm": 2.25, "grad_norm_var": 0.01402587890625, "learning_rate": 0.0001, "loss": 4.5131, "loss/crossentropy": 2.003869950771332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22874485701322556, "step": 4988 }, { "epoch": 0.0998, "grad_norm": 2.328125, "grad_norm_var": 0.017476399739583332, "learning_rate": 0.0001, "loss": 4.3221, "loss/crossentropy": 2.0251912474632263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240670546889305, "step": 4990 }, { "epoch": 0.09984, "grad_norm": 2.140625, "grad_norm_var": 0.020166015625, "learning_rate": 0.0001, "loss": 4.5297, "loss/crossentropy": 2.199779748916626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22916750609874725, "step": 4992 }, { "epoch": 0.09988, "grad_norm": 2.40625, "grad_norm_var": 0.0207672119140625, "learning_rate": 0.0001, "loss": 4.8066, "loss/crossentropy": 2.3852288722991943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26489999890327454, "step": 4994 }, { "epoch": 0.09992, "grad_norm": 2.359375, "grad_norm_var": 0.03874409993489583, "learning_rate": 0.0001, "loss": 4.7245, "loss/crossentropy": 1.9446094632148743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24112944304943085, "step": 4996 }, { "epoch": 0.09996, "grad_norm": 2.234375, "grad_norm_var": 0.04269205729166667, "learning_rate": 0.0001, "loss": 4.7063, "loss/crossentropy": 2.585115671157837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26535044610500336, "step": 4998 }, { "epoch": 0.1, "grad_norm": 2.8125, "grad_norm_var": 1.14400634765625, "learning_rate": 0.0001, "loss": 4.6848, "loss/crossentropy": 1.9871427416801453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24747569859027863, "step": 5000 }, { "epoch": 0.10004, "grad_norm": 2.359375, "grad_norm_var": 1.1424967447916667, "learning_rate": 0.0001, "loss": 4.6058, "loss/crossentropy": 1.8981972336769104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24131165444850922, "step": 5002 }, { "epoch": 0.10008, "grad_norm": 2.28125, "grad_norm_var": 1.1522786458333334, "learning_rate": 0.0001, "loss": 4.5731, "loss/crossentropy": 2.323825240135193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2584778293967247, "step": 5004 }, { "epoch": 0.10012, "grad_norm": 2.390625, "grad_norm_var": 1.1363118489583333, "learning_rate": 0.0001, "loss": 4.7065, "loss/crossentropy": 1.728028118610382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2997850477695465, "step": 5006 }, { "epoch": 0.10016, "grad_norm": 2.28125, "grad_norm_var": 1.12437744140625, "learning_rate": 0.0001, "loss": 4.8001, "loss/crossentropy": 2.1486289501190186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25335805118083954, "step": 5008 }, { "epoch": 0.1002, "grad_norm": 2.5, "grad_norm_var": 1.1099761962890624, "learning_rate": 0.0001, "loss": 4.936, "loss/crossentropy": 2.3132145404815674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26033517718315125, "step": 5010 }, { "epoch": 0.10024, "grad_norm": 2.671875, "grad_norm_var": 1.1134928385416667, "learning_rate": 0.0001, "loss": 4.493, "loss/crossentropy": 1.9233656525611877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2471245899796486, "step": 5012 }, { "epoch": 0.10028, "grad_norm": 2.21875, "grad_norm_var": 1.1301747639973958, "learning_rate": 0.0001, "loss": 4.5221, "loss/crossentropy": 1.9435511827468872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24730819463729858, "step": 5014 }, { "epoch": 0.10032, "grad_norm": 2.1875, "grad_norm_var": 0.018928019205729167, "learning_rate": 0.0001, "loss": 4.3895, "loss/crossentropy": 2.2031294107437134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23213820159435272, "step": 5016 }, { "epoch": 0.10036, "grad_norm": 2.703125, "grad_norm_var": 0.0276763916015625, "learning_rate": 0.0001, "loss": 4.7247, "loss/crossentropy": 2.285850405693054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25809091329574585, "step": 5018 }, { "epoch": 0.1004, "grad_norm": 2.328125, "grad_norm_var": 0.025211588541666666, "learning_rate": 0.0001, "loss": 4.7697, "loss/crossentropy": 1.8660435676574707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2436714917421341, "step": 5020 }, { "epoch": 0.10044, "grad_norm": 2.421875, "grad_norm_var": 0.025877888997395834, "learning_rate": 0.0001, "loss": 4.3716, "loss/crossentropy": 2.0659420490264893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24248096346855164, "step": 5022 }, { "epoch": 0.10048, "grad_norm": 2.328125, "grad_norm_var": 0.030516560872395834, "learning_rate": 0.0001, "loss": 4.7093, "loss/crossentropy": 2.213133215904236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2423061951994896, "step": 5024 }, { "epoch": 0.10052, "grad_norm": 2.234375, "grad_norm_var": 0.0313385009765625, "learning_rate": 0.0001, "loss": 4.5288, "loss/crossentropy": 2.3052343130111694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24757324904203415, "step": 5026 }, { "epoch": 0.10056, "grad_norm": 2.796875, "grad_norm_var": 0.039094034830729166, "learning_rate": 0.0001, "loss": 4.8709, "loss/crossentropy": 2.226990580558777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25892695784568787, "step": 5028 }, { "epoch": 0.1006, "grad_norm": 2.71875, "grad_norm_var": 0.04163004557291667, "learning_rate": 0.0001, "loss": 4.9444, "loss/crossentropy": 2.3460742235183716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27760128676891327, "step": 5030 }, { "epoch": 0.10064, "grad_norm": 2.125, "grad_norm_var": 0.04345703125, "learning_rate": 0.0001, "loss": 4.3597, "loss/crossentropy": 1.9782095551490784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349972277879715, "step": 5032 }, { "epoch": 0.10068, "grad_norm": 2.96875, "grad_norm_var": 0.06148681640625, "learning_rate": 0.0001, "loss": 4.5602, "loss/crossentropy": 1.847929298877716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22401423752307892, "step": 5034 }, { "epoch": 0.10072, "grad_norm": 2.46875, "grad_norm_var": 0.06073811848958333, "learning_rate": 0.0001, "loss": 4.4989, "loss/crossentropy": 2.172071158885956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2656550332903862, "step": 5036 }, { "epoch": 0.10076, "grad_norm": 2.234375, "grad_norm_var": 0.07026265462239584, "learning_rate": 0.0001, "loss": 4.3892, "loss/crossentropy": 2.3497499227523804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26085225492715836, "step": 5038 }, { "epoch": 0.1008, "grad_norm": 2.328125, "grad_norm_var": 0.07284749348958333, "learning_rate": 0.0001, "loss": 4.2583, "loss/crossentropy": 2.0916348695755005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21129868924617767, "step": 5040 }, { "epoch": 0.10084, "grad_norm": 2.59375, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 4.8931, "loss/crossentropy": 2.243077278137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2657035142183304, "step": 5042 }, { "epoch": 0.10088, "grad_norm": 10.0, "grad_norm_var": 3.600194295247396, "learning_rate": 0.0001, "loss": 4.8887, "loss/crossentropy": 1.9361066222190857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.41255413740873337, "step": 5044 }, { "epoch": 0.10092, "grad_norm": 3.703125, "grad_norm_var": 3.6162760416666666, "learning_rate": 0.0001, "loss": 4.7632, "loss/crossentropy": 2.0139313340187073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2591235190629959, "step": 5046 }, { "epoch": 0.10096, "grad_norm": 2.203125, "grad_norm_var": 3.589704386393229, "learning_rate": 0.0001, "loss": 4.6642, "loss/crossentropy": 2.335710287094116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26457205414772034, "step": 5048 }, { "epoch": 0.101, "grad_norm": 2.34375, "grad_norm_var": 3.604325358072917, "learning_rate": 0.0001, "loss": 4.3329, "loss/crossentropy": 1.9269848465919495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23867928236722946, "step": 5050 }, { "epoch": 0.10104, "grad_norm": 2.34375, "grad_norm_var": 3.627415974934896, "learning_rate": 0.0001, "loss": 4.4982, "loss/crossentropy": 1.9732608795166016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24961821734905243, "step": 5052 }, { "epoch": 0.10108, "grad_norm": 2.359375, "grad_norm_var": 3.6287506103515623, "learning_rate": 0.0001, "loss": 4.7218, "loss/crossentropy": 2.2659696340560913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24580512940883636, "step": 5054 }, { "epoch": 0.10112, "grad_norm": 2.84375, "grad_norm_var": 3.57880859375, "learning_rate": 0.0001, "loss": 4.4318, "loss/crossentropy": 1.6600720882415771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20283473283052444, "step": 5056 }, { "epoch": 0.10116, "grad_norm": 2.375, "grad_norm_var": 3.602855428059896, "learning_rate": 0.0001, "loss": 4.8249, "loss/crossentropy": 2.0175185799598694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23329314589500427, "step": 5058 }, { "epoch": 0.1012, "grad_norm": 2.4375, "grad_norm_var": 0.146484375, "learning_rate": 0.0001, "loss": 4.5132, "loss/crossentropy": 2.259668231010437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692077234387398, "step": 5060 }, { "epoch": 0.10124, "grad_norm": 2.46875, "grad_norm_var": 0.02144775390625, "learning_rate": 0.0001, "loss": 4.6392, "loss/crossentropy": 2.260953903198242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28225430846214294, "step": 5062 }, { "epoch": 0.10128, "grad_norm": 2.171875, "grad_norm_var": 0.021686808268229166, "learning_rate": 0.0001, "loss": 4.3624, "loss/crossentropy": 1.9721892476081848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2336483597755432, "step": 5064 }, { "epoch": 0.10132, "grad_norm": 2.40625, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 4.6257, "loss/crossentropy": 2.188947319984436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23959602415561676, "step": 5066 }, { "epoch": 0.10136, "grad_norm": 3.078125, "grad_norm_var": 0.04843343098958333, "learning_rate": 0.0001, "loss": 4.5601, "loss/crossentropy": 1.7914190292358398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2234276980161667, "step": 5068 }, { "epoch": 0.1014, "grad_norm": 2.265625, "grad_norm_var": 0.049925740559895834, "learning_rate": 0.0001, "loss": 4.5806, "loss/crossentropy": 2.1124663949012756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22523616254329681, "step": 5070 }, { "epoch": 0.10144, "grad_norm": 2.359375, "grad_norm_var": 0.03764546712239583, "learning_rate": 0.0001, "loss": 4.7098, "loss/crossentropy": 2.1146361231803894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22483345866203308, "step": 5072 }, { "epoch": 0.10148, "grad_norm": 2.171875, "grad_norm_var": 0.04121805826822917, "learning_rate": 0.0001, "loss": 4.5076, "loss/crossentropy": 2.335755705833435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.256914846599102, "step": 5074 }, { "epoch": 0.10152, "grad_norm": 2.34375, "grad_norm_var": 0.04346415201822917, "learning_rate": 0.0001, "loss": 4.8665, "loss/crossentropy": 2.2911819219589233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26038119196891785, "step": 5076 }, { "epoch": 0.10156, "grad_norm": 2.171875, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 4.592, "loss/crossentropy": 2.059769034385681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24004006385803223, "step": 5078 }, { "epoch": 0.1016, "grad_norm": 2.109375, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 4.5028, "loss/crossentropy": 2.141201138496399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22454539686441422, "step": 5080 }, { "epoch": 0.10164, "grad_norm": 2.421875, "grad_norm_var": 0.050146484375, "learning_rate": 0.0001, "loss": 4.7527, "loss/crossentropy": 1.9538633823394775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24477297067642212, "step": 5082 }, { "epoch": 0.10168, "grad_norm": 2.1875, "grad_norm_var": 0.00992431640625, "learning_rate": 0.0001, "loss": 4.4799, "loss/crossentropy": 2.1555078625679016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2425938919186592, "step": 5084 }, { "epoch": 0.10172, "grad_norm": 2.296875, "grad_norm_var": 0.009235636393229166, "learning_rate": 0.0001, "loss": 4.5583, "loss/crossentropy": 2.2306214570999146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2445632517337799, "step": 5086 }, { "epoch": 0.10176, "grad_norm": 2.03125, "grad_norm_var": 0.010179646809895833, "learning_rate": 0.0001, "loss": 4.1075, "loss/crossentropy": 1.9713392853736877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22239256650209427, "step": 5088 }, { "epoch": 0.1018, "grad_norm": 2.234375, "grad_norm_var": 0.011799112955729166, "learning_rate": 0.0001, "loss": 4.5181, "loss/crossentropy": 1.951128602027893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24310563504695892, "step": 5090 }, { "epoch": 0.10184, "grad_norm": 2.28125, "grad_norm_var": 0.011051432291666666, "learning_rate": 0.0001, "loss": 4.3617, "loss/crossentropy": 2.0100057125091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23054375499486923, "step": 5092 }, { "epoch": 0.10188, "grad_norm": 2.140625, "grad_norm_var": 0.011042277018229166, "learning_rate": 0.0001, "loss": 4.4573, "loss/crossentropy": 2.1898789405822754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24956130981445312, "step": 5094 }, { "epoch": 0.10192, "grad_norm": 2.21875, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 4.6576, "loss/crossentropy": 1.5666239857673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20010025054216385, "step": 5096 }, { "epoch": 0.10196, "grad_norm": 2.296875, "grad_norm_var": 0.022468058268229167, "learning_rate": 0.0001, "loss": 4.2762, "loss/crossentropy": 1.884951651096344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2484818547964096, "step": 5098 }, { "epoch": 0.102, "grad_norm": 2.34375, "grad_norm_var": 0.022554524739583335, "learning_rate": 0.0001, "loss": 4.5207, "loss/crossentropy": 1.976080298423767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23380043357610703, "step": 5100 }, { "epoch": 0.10204, "grad_norm": 2.375, "grad_norm_var": 0.0229156494140625, "learning_rate": 0.0001, "loss": 4.592, "loss/crossentropy": 2.1262658834457397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2441401332616806, "step": 5102 }, { "epoch": 0.10208, "grad_norm": 2.515625, "grad_norm_var": 0.08055013020833333, "learning_rate": 0.0001, "loss": 4.2735, "loss/crossentropy": 1.7588757276535034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101762592792511, "step": 5104 }, { "epoch": 0.10212, "grad_norm": 2.578125, "grad_norm_var": 0.08059488932291667, "learning_rate": 0.0001, "loss": 4.6721, "loss/crossentropy": 2.292783260345459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26928654313087463, "step": 5106 }, { "epoch": 0.10216, "grad_norm": 2.65625, "grad_norm_var": 0.4834218343098958, "learning_rate": 0.0001, "loss": 4.7475, "loss/crossentropy": 2.0670888423919678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2644767463207245, "step": 5108 }, { "epoch": 0.1022, "grad_norm": 2.1875, "grad_norm_var": 0.4729563395182292, "learning_rate": 0.0001, "loss": 4.4113, "loss/crossentropy": 2.0522598028182983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2410094290971756, "step": 5110 }, { "epoch": 0.10224, "grad_norm": 2.359375, "grad_norm_var": 0.4729075113932292, "learning_rate": 0.0001, "loss": 4.4914, "loss/crossentropy": 2.0756974816322327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27077721059322357, "step": 5112 }, { "epoch": 0.10228, "grad_norm": 2.3125, "grad_norm_var": 0.47226155598958336, "learning_rate": 0.0001, "loss": 4.6524, "loss/crossentropy": 2.1569767594337463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23582034558057785, "step": 5114 }, { "epoch": 0.10232, "grad_norm": 2.21875, "grad_norm_var": 0.4847819010416667, "learning_rate": 0.0001, "loss": 4.2821, "loss/crossentropy": 1.9736077785491943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23984025418758392, "step": 5116 }, { "epoch": 0.10236, "grad_norm": 2.3125, "grad_norm_var": 0.4942698160807292, "learning_rate": 0.0001, "loss": 4.3047, "loss/crossentropy": 2.1400066614151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2347380667924881, "step": 5118 }, { "epoch": 0.1024, "grad_norm": 2.171875, "grad_norm_var": 0.46923726399739585, "learning_rate": 0.0001, "loss": 4.3531, "loss/crossentropy": 1.989999234676361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22508147358894348, "step": 5120 }, { "epoch": 0.10244, "grad_norm": 2.5, "grad_norm_var": 0.4704498291015625, "learning_rate": 0.0001, "loss": 4.8817, "loss/crossentropy": 2.132390856742859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3074522316455841, "step": 5122 }, { "epoch": 0.10248, "grad_norm": 2.296875, "grad_norm_var": 0.011002604166666667, "learning_rate": 0.0001, "loss": 4.3606, "loss/crossentropy": 1.7906856536865234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126675397157669, "step": 5124 }, { "epoch": 0.10252, "grad_norm": 2.265625, "grad_norm_var": 0.0112457275390625, "learning_rate": 0.0001, "loss": 4.7045, "loss/crossentropy": 2.0576369762420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27211636304855347, "step": 5126 }, { "epoch": 0.10256, "grad_norm": 2.265625, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 4.5402, "loss/crossentropy": 2.1174184679985046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2344469577074051, "step": 5128 }, { "epoch": 0.1026, "grad_norm": 2.28125, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 4.7227, "loss/crossentropy": 1.9139717817306519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2362786829471588, "step": 5130 }, { "epoch": 0.10264, "grad_norm": 2.3125, "grad_norm_var": 0.008495076497395834, "learning_rate": 0.0001, "loss": 4.4801, "loss/crossentropy": 2.022357940673828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2307513877749443, "step": 5132 }, { "epoch": 0.10268, "grad_norm": 2.203125, "grad_norm_var": 0.007721964518229167, "learning_rate": 0.0001, "loss": 4.3963, "loss/crossentropy": 2.038477897644043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22871223092079163, "step": 5134 }, { "epoch": 0.10272, "grad_norm": 2.421875, "grad_norm_var": 0.007079060872395833, "learning_rate": 0.0001, "loss": 4.7283, "loss/crossentropy": 2.0895442962646484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23719681799411774, "step": 5136 }, { "epoch": 0.10276, "grad_norm": 2.5625, "grad_norm_var": 0.008820597330729167, "learning_rate": 0.0001, "loss": 4.7059, "loss/crossentropy": 2.17978835105896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2519204765558243, "step": 5138 }, { "epoch": 0.1028, "grad_norm": 2.46875, "grad_norm_var": 0.00953369140625, "learning_rate": 0.0001, "loss": 4.7318, "loss/crossentropy": 2.19089937210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24376338720321655, "step": 5140 }, { "epoch": 0.10284, "grad_norm": 2.515625, "grad_norm_var": 0.011002604166666667, "learning_rate": 0.0001, "loss": 4.3961, "loss/crossentropy": 2.018259823322296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23707401752471924, "step": 5142 }, { "epoch": 0.10288, "grad_norm": 2.484375, "grad_norm_var": 0.012287394205729166, "learning_rate": 0.0001, "loss": 4.7556, "loss/crossentropy": 2.110401153564453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23409561812877655, "step": 5144 }, { "epoch": 0.10292, "grad_norm": 2.390625, "grad_norm_var": 0.012239583333333333, "learning_rate": 0.0001, "loss": 4.615, "loss/crossentropy": 2.2096832990646362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2580249160528183, "step": 5146 }, { "epoch": 0.10296, "grad_norm": 2.09375, "grad_norm_var": 0.015803019205729168, "learning_rate": 0.0001, "loss": 4.5218, "loss/crossentropy": 2.1825822591781616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23592937737703323, "step": 5148 }, { "epoch": 0.103, "grad_norm": 2.359375, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 4.6945, "loss/crossentropy": 2.2440234422683716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23741164803504944, "step": 5150 }, { "epoch": 0.10304, "grad_norm": 2.4375, "grad_norm_var": 0.015071614583333334, "learning_rate": 0.0001, "loss": 4.962, "loss/crossentropy": 2.3818799257278442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2642097622156143, "step": 5152 }, { "epoch": 0.10308, "grad_norm": 2.296875, "grad_norm_var": 0.014378865559895834, "learning_rate": 0.0001, "loss": 4.5706, "loss/crossentropy": 2.1905806064605713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26945509016513824, "step": 5154 }, { "epoch": 0.10312, "grad_norm": 2.140625, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 4.5678, "loss/crossentropy": 2.096913695335388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2540033459663391, "step": 5156 }, { "epoch": 0.10316, "grad_norm": 2.375, "grad_norm_var": 0.015721638997395832, "learning_rate": 0.0001, "loss": 4.5286, "loss/crossentropy": 1.7916489243507385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22504088282585144, "step": 5158 }, { "epoch": 0.1032, "grad_norm": 2.34375, "grad_norm_var": 0.014240519205729166, "learning_rate": 0.0001, "loss": 4.6256, "loss/crossentropy": 1.9366079568862915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25474467873573303, "step": 5160 }, { "epoch": 0.10324, "grad_norm": 2.40625, "grad_norm_var": 0.014533487955729167, "learning_rate": 0.0001, "loss": 4.5054, "loss/crossentropy": 2.0233771800994873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23394355177879333, "step": 5162 }, { "epoch": 0.10328, "grad_norm": 2.234375, "grad_norm_var": 0.010693359375, "learning_rate": 0.0001, "loss": 4.7803, "loss/crossentropy": 2.442312717437744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28390438854694366, "step": 5164 }, { "epoch": 0.10332, "grad_norm": 2.3125, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 4.7849, "loss/crossentropy": 1.9547526836395264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2298036813735962, "step": 5166 }, { "epoch": 0.10336, "grad_norm": 2.25, "grad_norm_var": 0.018766276041666665, "learning_rate": 0.0001, "loss": 4.3225, "loss/crossentropy": 1.7974739074707031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959447860717773, "step": 5168 }, { "epoch": 0.1034, "grad_norm": 2.328125, "grad_norm_var": 0.020075480143229168, "learning_rate": 0.0001, "loss": 4.5438, "loss/crossentropy": 1.9860564470291138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24661653488874435, "step": 5170 }, { "epoch": 0.10344, "grad_norm": 2.34375, "grad_norm_var": 0.019050089518229167, "learning_rate": 0.0001, "loss": 4.5084, "loss/crossentropy": 1.6198940873146057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20592768490314484, "step": 5172 }, { "epoch": 0.10348, "grad_norm": 2.25, "grad_norm_var": 0.019554646809895833, "learning_rate": 0.0001, "loss": 4.4831, "loss/crossentropy": 2.193474531173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24437790364027023, "step": 5174 }, { "epoch": 0.10352, "grad_norm": 2.5, "grad_norm_var": 0.022557576497395832, "learning_rate": 0.0001, "loss": 4.9209, "loss/crossentropy": 2.221992254257202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2931511402130127, "step": 5176 }, { "epoch": 0.10356, "grad_norm": 2.28125, "grad_norm_var": 0.022272745768229168, "learning_rate": 0.0001, "loss": 4.4611, "loss/crossentropy": 2.006419837474823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335921749472618, "step": 5178 }, { "epoch": 0.1036, "grad_norm": 2.171875, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 4.4477, "loss/crossentropy": 2.2861804962158203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2549041658639908, "step": 5180 }, { "epoch": 0.10364, "grad_norm": 2.171875, "grad_norm_var": 0.014378865559895834, "learning_rate": 0.0001, "loss": 4.4079, "loss/crossentropy": 2.217998743057251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24741299450397491, "step": 5182 }, { "epoch": 0.10368, "grad_norm": 2.328125, "grad_norm_var": 0.0123931884765625, "learning_rate": 0.0001, "loss": 4.7344, "loss/crossentropy": 2.1875526905059814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23636415600776672, "step": 5184 }, { "epoch": 0.10372, "grad_norm": 2.078125, "grad_norm_var": 0.014850870768229166, "learning_rate": 0.0001, "loss": 4.4629, "loss/crossentropy": 1.8408135175704956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23278112709522247, "step": 5186 }, { "epoch": 0.10376, "grad_norm": 2.359375, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 4.6848, "loss/crossentropy": 1.7936646342277527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23859456181526184, "step": 5188 }, { "epoch": 0.1038, "grad_norm": 2.21875, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 4.3825, "loss/crossentropy": 2.0800318717956543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23155340552330017, "step": 5190 }, { "epoch": 0.10384, "grad_norm": 2.265625, "grad_norm_var": 0.0066802978515625, "learning_rate": 0.0001, "loss": 4.9645, "loss/crossentropy": 2.277778387069702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25029345601797104, "step": 5192 }, { "epoch": 0.10388, "grad_norm": 2.09375, "grad_norm_var": 0.0081695556640625, "learning_rate": 0.0001, "loss": 4.2486, "loss/crossentropy": 1.9658478498458862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22655482590198517, "step": 5194 }, { "epoch": 0.10392, "grad_norm": 2.21875, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.7503, "loss/crossentropy": 2.214509129524231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.254493810236454, "step": 5196 }, { "epoch": 0.10396, "grad_norm": 2.21875, "grad_norm_var": 0.0102203369140625, "learning_rate": 0.0001, "loss": 4.3648, "loss/crossentropy": 1.9465742707252502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23288530111312866, "step": 5198 }, { "epoch": 0.104, "grad_norm": 2.21875, "grad_norm_var": 0.010277303059895833, "learning_rate": 0.0001, "loss": 4.3216, "loss/crossentropy": 2.062779188156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21423730999231339, "step": 5200 }, { "epoch": 0.10404, "grad_norm": 2.390625, "grad_norm_var": 0.010350545247395834, "learning_rate": 0.0001, "loss": 4.1645, "loss/crossentropy": 1.777470588684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22998665273189545, "step": 5202 }, { "epoch": 0.10408, "grad_norm": 2.46875, "grad_norm_var": 0.05396728515625, "learning_rate": 0.0001, "loss": 4.8979, "loss/crossentropy": 2.2505980730056763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30352361500263214, "step": 5204 }, { "epoch": 0.10412, "grad_norm": 3.359375, "grad_norm_var": 0.12148335774739584, "learning_rate": 0.0001, "loss": 4.4093, "loss/crossentropy": 1.8989517092704773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2210940569639206, "step": 5206 }, { "epoch": 0.10416, "grad_norm": 2.734375, "grad_norm_var": 0.12923177083333334, "learning_rate": 0.0001, "loss": 4.5535, "loss/crossentropy": 2.378798723220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25575730204582214, "step": 5208 }, { "epoch": 0.1042, "grad_norm": 2.34375, "grad_norm_var": 0.11901041666666666, "learning_rate": 0.0001, "loss": 4.6763, "loss/crossentropy": 1.7642006278038025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22105304896831512, "step": 5210 }, { "epoch": 0.10424, "grad_norm": 2.375, "grad_norm_var": 0.11607666015625, "learning_rate": 0.0001, "loss": 4.8945, "loss/crossentropy": 2.188746988773346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24192160367965698, "step": 5212 }, { "epoch": 0.10428, "grad_norm": 2.3125, "grad_norm_var": 0.11298421223958334, "learning_rate": 0.0001, "loss": 4.6001, "loss/crossentropy": 2.116630494594574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2879187613725662, "step": 5214 }, { "epoch": 0.10432, "grad_norm": 2.15625, "grad_norm_var": 0.11013895670572917, "learning_rate": 0.0001, "loss": 4.4932, "loss/crossentropy": 1.8329599499702454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21425092220306396, "step": 5216 }, { "epoch": 0.10436, "grad_norm": 2.296875, "grad_norm_var": 0.1064849853515625, "learning_rate": 0.0001, "loss": 4.3966, "loss/crossentropy": 2.1063259840011597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23704984784126282, "step": 5218 }, { "epoch": 0.1044, "grad_norm": 2.25, "grad_norm_var": 0.08238525390625, "learning_rate": 0.0001, "loss": 4.4821, "loss/crossentropy": 1.7994996309280396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21408653259277344, "step": 5220 }, { "epoch": 0.10444, "grad_norm": 2.65625, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 4.2437, "loss/crossentropy": 1.880006492137909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24362730979919434, "step": 5222 }, { "epoch": 0.10448, "grad_norm": 2.25, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 4.2726, "loss/crossentropy": 2.010268449783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22580985724925995, "step": 5224 }, { "epoch": 0.10452, "grad_norm": 2.3125, "grad_norm_var": 0.0177886962890625, "learning_rate": 0.0001, "loss": 4.6839, "loss/crossentropy": 2.057162046432495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23878254741430283, "step": 5226 }, { "epoch": 0.10456, "grad_norm": 2.21875, "grad_norm_var": 0.019603474934895834, "learning_rate": 0.0001, "loss": 4.3774, "loss/crossentropy": 2.0615930557250977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22890077531337738, "step": 5228 }, { "epoch": 0.1046, "grad_norm": 2.40625, "grad_norm_var": 0.020817057291666666, "learning_rate": 0.0001, "loss": 4.7693, "loss/crossentropy": 1.9013472199440002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22539222240447998, "step": 5230 }, { "epoch": 0.10464, "grad_norm": 2.296875, "grad_norm_var": 0.019482421875, "learning_rate": 0.0001, "loss": 4.5531, "loss/crossentropy": 2.239185929298401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24706681817770004, "step": 5232 }, { "epoch": 0.10468, "grad_norm": 2.234375, "grad_norm_var": 0.020540364583333335, "learning_rate": 0.0001, "loss": 4.3902, "loss/crossentropy": 1.9881523251533508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24434638023376465, "step": 5234 }, { "epoch": 0.10472, "grad_norm": 2.3125, "grad_norm_var": 0.020524088541666666, "learning_rate": 0.0001, "loss": 4.7908, "loss/crossentropy": 1.9529212713241577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2321469634771347, "step": 5236 }, { "epoch": 0.10476, "grad_norm": 2.40625, "grad_norm_var": 0.01041259765625, "learning_rate": 0.0001, "loss": 4.8045, "loss/crossentropy": 2.196424722671509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24054434895515442, "step": 5238 }, { "epoch": 0.1048, "grad_norm": 2.234375, "grad_norm_var": 0.009781901041666667, "learning_rate": 0.0001, "loss": 4.5895, "loss/crossentropy": 2.082987070083618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2760557308793068, "step": 5240 }, { "epoch": 0.10484, "grad_norm": 2.359375, "grad_norm_var": 0.011652628580729166, "learning_rate": 0.0001, "loss": 4.5961, "loss/crossentropy": 2.2369720935821533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25305214524269104, "step": 5242 }, { "epoch": 0.10488, "grad_norm": 2.1875, "grad_norm_var": 0.0119537353515625, "learning_rate": 0.0001, "loss": 4.7268, "loss/crossentropy": 2.3372031450271606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26765232533216476, "step": 5244 }, { "epoch": 0.10492, "grad_norm": 2.3125, "grad_norm_var": 0.0113677978515625, "learning_rate": 0.0001, "loss": 4.6148, "loss/crossentropy": 2.23944628238678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26508912444114685, "step": 5246 }, { "epoch": 0.10496, "grad_norm": 2.21875, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 4.3453, "loss/crossentropy": 2.2701858282089233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551077604293823, "step": 5248 }, { "epoch": 0.105, "grad_norm": 2.234375, "grad_norm_var": 0.010887654622395833, "learning_rate": 0.0001, "loss": 4.6455, "loss/crossentropy": 2.293464779853821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25016437470912933, "step": 5250 }, { "epoch": 0.10504, "grad_norm": 2.46875, "grad_norm_var": 0.0109527587890625, "learning_rate": 0.0001, "loss": 4.5798, "loss/crossentropy": 2.3072171211242676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27824972569942474, "step": 5252 }, { "epoch": 0.10508, "grad_norm": 2.453125, "grad_norm_var": 0.010498046875, "learning_rate": 0.0001, "loss": 4.5948, "loss/crossentropy": 1.9855756759643555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312464788556099, "step": 5254 }, { "epoch": 0.10512, "grad_norm": 2.3125, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 4.8104, "loss/crossentropy": 1.9584077596664429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2386016771197319, "step": 5256 }, { "epoch": 0.10516, "grad_norm": 2.265625, "grad_norm_var": 0.010741170247395833, "learning_rate": 0.0001, "loss": 4.6103, "loss/crossentropy": 2.2184669375419617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25914129614830017, "step": 5258 }, { "epoch": 0.1052, "grad_norm": 2.1875, "grad_norm_var": 0.0122222900390625, "learning_rate": 0.0001, "loss": 4.1221, "loss/crossentropy": 1.6798554062843323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2037271112203598, "step": 5260 }, { "epoch": 0.10524, "grad_norm": 2.203125, "grad_norm_var": 0.011995442708333333, "learning_rate": 0.0001, "loss": 4.2718, "loss/crossentropy": 1.8675006031990051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21148693561553955, "step": 5262 }, { "epoch": 0.10528, "grad_norm": 2.390625, "grad_norm_var": 0.012336222330729167, "learning_rate": 0.0001, "loss": 4.5136, "loss/crossentropy": 1.9439855813980103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24611759185791016, "step": 5264 }, { "epoch": 0.10532, "grad_norm": 2.171875, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.5298, "loss/crossentropy": 2.1550748348236084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2428945228457451, "step": 5266 }, { "epoch": 0.10536, "grad_norm": 2.34375, "grad_norm_var": 0.0117584228515625, "learning_rate": 0.0001, "loss": 4.6777, "loss/crossentropy": 1.9924054741859436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21866007149219513, "step": 5268 }, { "epoch": 0.1054, "grad_norm": 2.40625, "grad_norm_var": 0.011571248372395834, "learning_rate": 0.0001, "loss": 4.5359, "loss/crossentropy": 2.0413920879364014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2579498365521431, "step": 5270 }, { "epoch": 0.10544, "grad_norm": 2.578125, "grad_norm_var": 0.016380818684895833, "learning_rate": 0.0001, "loss": 4.6406, "loss/crossentropy": 2.062632381916046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24655035883188248, "step": 5272 }, { "epoch": 0.10548, "grad_norm": 2.421875, "grad_norm_var": 0.021833292643229165, "learning_rate": 0.0001, "loss": 4.8133, "loss/crossentropy": 2.0620386600494385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3041655272245407, "step": 5274 }, { "epoch": 0.10552, "grad_norm": 2.234375, "grad_norm_var": 0.016657511393229168, "learning_rate": 0.0001, "loss": 4.4775, "loss/crossentropy": 1.966421365737915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22333864122629166, "step": 5276 }, { "epoch": 0.10556, "grad_norm": 2.25, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 4.4323, "loss/crossentropy": 1.9120238423347473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22857370972633362, "step": 5278 }, { "epoch": 0.1056, "grad_norm": 2.40625, "grad_norm_var": 0.019701131184895835, "learning_rate": 0.0001, "loss": 4.5179, "loss/crossentropy": 2.084389805793762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2556762397289276, "step": 5280 }, { "epoch": 0.10564, "grad_norm": 2.328125, "grad_norm_var": 0.019758097330729165, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 1.8707188367843628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22955116629600525, "step": 5282 }, { "epoch": 0.10568, "grad_norm": 2.265625, "grad_norm_var": 0.019775390625, "learning_rate": 0.0001, "loss": 4.3538, "loss/crossentropy": 1.8243692517280579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21088172495365143, "step": 5284 }, { "epoch": 0.10572, "grad_norm": 2.359375, "grad_norm_var": 0.0192535400390625, "learning_rate": 0.0001, "loss": 4.5046, "loss/crossentropy": 2.111305356025696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2595943957567215, "step": 5286 }, { "epoch": 0.10576, "grad_norm": 2.15625, "grad_norm_var": 0.015697224934895834, "learning_rate": 0.0001, "loss": 4.4598, "loss/crossentropy": 2.3729283809661865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2632211595773697, "step": 5288 }, { "epoch": 0.1058, "grad_norm": 2.140625, "grad_norm_var": 0.007249959309895833, "learning_rate": 0.0001, "loss": 4.6256, "loss/crossentropy": 2.3542696237564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25645585358142853, "step": 5290 }, { "epoch": 0.10584, "grad_norm": 2.4375, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.4863, "loss/crossentropy": 2.0140068531036377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23033145815134048, "step": 5292 }, { "epoch": 0.10588, "grad_norm": 2.375, "grad_norm_var": 0.0099761962890625, "learning_rate": 0.0001, "loss": 4.8126, "loss/crossentropy": 1.9499077796936035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21458172798156738, "step": 5294 }, { "epoch": 0.10592, "grad_norm": 2.25, "grad_norm_var": 0.0084625244140625, "learning_rate": 0.0001, "loss": 4.479, "loss/crossentropy": 2.1434344053268433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2552379444241524, "step": 5296 }, { "epoch": 0.10596, "grad_norm": 2.359375, "grad_norm_var": 0.007819620768229167, "learning_rate": 0.0001, "loss": 4.7149, "loss/crossentropy": 1.9951340556144714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21683495491743088, "step": 5298 }, { "epoch": 0.106, "grad_norm": 2.5625, "grad_norm_var": 0.02330322265625, "learning_rate": 0.0001, "loss": 4.7855, "loss/crossentropy": 2.380235195159912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2756577730178833, "step": 5300 }, { "epoch": 0.10604, "grad_norm": 2.3125, "grad_norm_var": 0.02330322265625, "learning_rate": 0.0001, "loss": 4.8596, "loss/crossentropy": 2.298241972923279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2598598450422287, "step": 5302 }, { "epoch": 0.10608, "grad_norm": 2.5, "grad_norm_var": 0.032136027018229166, "learning_rate": 0.0001, "loss": 4.8615, "loss/crossentropy": 2.093233823776245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23809552192687988, "step": 5304 }, { "epoch": 0.10612, "grad_norm": 2.15625, "grad_norm_var": 0.031493123372395834, "learning_rate": 0.0001, "loss": 4.5271, "loss/crossentropy": 2.0901564955711365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24874016642570496, "step": 5306 }, { "epoch": 0.10616, "grad_norm": 2.453125, "grad_norm_var": 0.0304107666015625, "learning_rate": 0.0001, "loss": 4.3193, "loss/crossentropy": 1.8029736280441284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20973137766122818, "step": 5308 }, { "epoch": 0.1062, "grad_norm": 2.25, "grad_norm_var": 0.03316650390625, "learning_rate": 0.0001, "loss": 4.4255, "loss/crossentropy": 2.4068437814712524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25310443341732025, "step": 5310 }, { "epoch": 0.10624, "grad_norm": 2.265625, "grad_norm_var": 0.031029256184895833, "learning_rate": 0.0001, "loss": 4.4499, "loss/crossentropy": 2.1125503182411194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2455870360136032, "step": 5312 }, { "epoch": 0.10628, "grad_norm": 2.234375, "grad_norm_var": 0.0323150634765625, "learning_rate": 0.0001, "loss": 4.4752, "loss/crossentropy": 2.0995737314224243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2418016716837883, "step": 5314 }, { "epoch": 0.10632, "grad_norm": 2.234375, "grad_norm_var": 0.0212890625, "learning_rate": 0.0001, "loss": 4.5873, "loss/crossentropy": 1.8753212690353394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24125799536705017, "step": 5316 }, { "epoch": 0.10636, "grad_norm": 2.390625, "grad_norm_var": 0.021451822916666665, "learning_rate": 0.0001, "loss": 4.7269, "loss/crossentropy": 2.0175408720970154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23516137897968292, "step": 5318 }, { "epoch": 0.1064, "grad_norm": 2.109375, "grad_norm_var": 0.00914306640625, "learning_rate": 0.0001, "loss": 4.4953, "loss/crossentropy": 2.2671592235565186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356845736503601, "step": 5320 }, { "epoch": 0.10644, "grad_norm": 2.140625, "grad_norm_var": 0.009468587239583333, "learning_rate": 0.0001, "loss": 4.5328, "loss/crossentropy": 2.142452359199524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26422591507434845, "step": 5322 }, { "epoch": 0.10648, "grad_norm": 2.3125, "grad_norm_var": 0.007225545247395834, "learning_rate": 0.0001, "loss": 4.4328, "loss/crossentropy": 1.9664896726608276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25856246054172516, "step": 5324 }, { "epoch": 0.10652, "grad_norm": 2.203125, "grad_norm_var": 0.007222493489583333, "learning_rate": 0.0001, "loss": 4.5531, "loss/crossentropy": 2.168110191822052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23274105042219162, "step": 5326 }, { "epoch": 0.10656, "grad_norm": 2.21875, "grad_norm_var": 0.00552978515625, "learning_rate": 0.0001, "loss": 4.5242, "loss/crossentropy": 2.006514251232147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532104551792145, "step": 5328 }, { "epoch": 0.1066, "grad_norm": 2.296875, "grad_norm_var": 0.0070220947265625, "learning_rate": 0.0001, "loss": 4.5593, "loss/crossentropy": 2.462701439857483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2771739661693573, "step": 5330 }, { "epoch": 0.10664, "grad_norm": 2.203125, "grad_norm_var": 0.0076324462890625, "learning_rate": 0.0001, "loss": 4.4076, "loss/crossentropy": 2.0889209508895874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2479858472943306, "step": 5332 }, { "epoch": 0.10668, "grad_norm": 2.25, "grad_norm_var": 0.008356730143229166, "learning_rate": 0.0001, "loss": 4.329, "loss/crossentropy": 1.8056100606918335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22194840013980865, "step": 5334 }, { "epoch": 0.10672, "grad_norm": 2.171875, "grad_norm_var": 0.0075185139973958336, "learning_rate": 0.0001, "loss": 4.703, "loss/crossentropy": 2.32460880279541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28185322880744934, "step": 5336 }, { "epoch": 0.10676, "grad_norm": 2.140625, "grad_norm_var": 0.0075185139973958336, "learning_rate": 0.0001, "loss": 4.6388, "loss/crossentropy": 2.238978862762451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24314726889133453, "step": 5338 }, { "epoch": 0.1068, "grad_norm": 2.0625, "grad_norm_var": 0.00914306640625, "learning_rate": 0.0001, "loss": 4.4161, "loss/crossentropy": 1.8914734721183777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20767460763454437, "step": 5340 }, { "epoch": 0.10684, "grad_norm": 2.328125, "grad_norm_var": 0.010445149739583333, "learning_rate": 0.0001, "loss": 4.5628, "loss/crossentropy": 1.9704068899154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23617815226316452, "step": 5342 }, { "epoch": 0.10688, "grad_norm": 2.40625, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 4.214, "loss/crossentropy": 1.8539690971374512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218247577548027, "step": 5344 }, { "epoch": 0.10692, "grad_norm": 2.28125, "grad_norm_var": 0.012596638997395833, "learning_rate": 0.0001, "loss": 4.6077, "loss/crossentropy": 1.982038140296936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23828908801078796, "step": 5346 }, { "epoch": 0.10696, "grad_norm": 2.28125, "grad_norm_var": 0.013313802083333333, "learning_rate": 0.0001, "loss": 4.2879, "loss/crossentropy": 1.8247870802879333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22487633675336838, "step": 5348 }, { "epoch": 0.107, "grad_norm": 2.1875, "grad_norm_var": 0.011253865559895833, "learning_rate": 0.0001, "loss": 4.382, "loss/crossentropy": 2.0704214572906494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24332743138074875, "step": 5350 }, { "epoch": 0.10704, "grad_norm": 2.5, "grad_norm_var": 0.013509114583333334, "learning_rate": 0.0001, "loss": 4.709, "loss/crossentropy": 2.037345290184021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2515557184815407, "step": 5352 }, { "epoch": 0.10708, "grad_norm": 2.609375, "grad_norm_var": 0.020702107747395834, "learning_rate": 0.0001, "loss": 4.5427, "loss/crossentropy": 2.0561426877975464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24030926823616028, "step": 5354 }, { "epoch": 0.10712, "grad_norm": 2.3125, "grad_norm_var": 0.017943318684895834, "learning_rate": 0.0001, "loss": 4.3108, "loss/crossentropy": 1.6871100068092346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2526541203260422, "step": 5356 }, { "epoch": 0.10716, "grad_norm": 2.390625, "grad_norm_var": 0.01978759765625, "learning_rate": 0.0001, "loss": 4.5238, "loss/crossentropy": 2.0133201479911804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23663413524627686, "step": 5358 }, { "epoch": 0.1072, "grad_norm": 2.34375, "grad_norm_var": 0.0170562744140625, "learning_rate": 0.0001, "loss": 4.557, "loss/crossentropy": 2.0627574920654297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24395380914211273, "step": 5360 }, { "epoch": 0.10724, "grad_norm": 2.140625, "grad_norm_var": 0.020929972330729168, "learning_rate": 0.0001, "loss": 4.5495, "loss/crossentropy": 2.280818462371826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2806383967399597, "step": 5362 }, { "epoch": 0.10728, "grad_norm": 2.5, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 4.4743, "loss/crossentropy": 2.002636671066284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23986798524856567, "step": 5364 }, { "epoch": 0.10732, "grad_norm": 2.453125, "grad_norm_var": 0.022298177083333332, "learning_rate": 0.0001, "loss": 4.6816, "loss/crossentropy": 2.042721927165985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23199205100536346, "step": 5366 }, { "epoch": 0.10736, "grad_norm": 2.25, "grad_norm_var": 0.021540323893229168, "learning_rate": 0.0001, "loss": 4.3225, "loss/crossentropy": 2.1047908663749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23987916857004166, "step": 5368 }, { "epoch": 0.1074, "grad_norm": 2.1875, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 4.4827, "loss/crossentropy": 2.0513075590133667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242512047290802, "step": 5370 }, { "epoch": 0.10744, "grad_norm": 2.4375, "grad_norm_var": 0.015913899739583334, "learning_rate": 0.0001, "loss": 4.4452, "loss/crossentropy": 1.9151215553283691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23007020354270935, "step": 5372 }, { "epoch": 0.10748, "grad_norm": 2.125, "grad_norm_var": 0.019303385416666666, "learning_rate": 0.0001, "loss": 4.2344, "loss/crossentropy": 1.8759313821792603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068658247590065, "step": 5374 }, { "epoch": 0.10752, "grad_norm": 2.46875, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 4.4667, "loss/crossentropy": 2.1500572562217712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24392293393611908, "step": 5376 }, { "epoch": 0.10756, "grad_norm": 2.109375, "grad_norm_var": 0.018355305989583334, "learning_rate": 0.0001, "loss": 4.0917, "loss/crossentropy": 1.6089633703231812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20190145075321198, "step": 5378 }, { "epoch": 0.1076, "grad_norm": 2.1875, "grad_norm_var": 0.014452107747395833, "learning_rate": 0.0001, "loss": 4.6521, "loss/crossentropy": 2.1967561841011047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2749984338879585, "step": 5380 }, { "epoch": 0.10764, "grad_norm": 2.171875, "grad_norm_var": 0.011311848958333334, "learning_rate": 0.0001, "loss": 4.0843, "loss/crossentropy": 1.8293656706809998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20650158822536469, "step": 5382 }, { "epoch": 0.10768, "grad_norm": 2.125, "grad_norm_var": 0.011847941080729167, "learning_rate": 0.0001, "loss": 4.3441, "loss/crossentropy": 2.3964673280715942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2814205437898636, "step": 5384 }, { "epoch": 0.10772, "grad_norm": 2.4375, "grad_norm_var": 0.021842447916666667, "learning_rate": 0.0001, "loss": 4.8872, "loss/crossentropy": 2.4995274543762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28704003244638443, "step": 5386 }, { "epoch": 0.10776, "grad_norm": 2.296875, "grad_norm_var": 0.019462076822916667, "learning_rate": 0.0001, "loss": 4.6066, "loss/crossentropy": 2.0650060176849365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23606212437152863, "step": 5388 }, { "epoch": 0.1078, "grad_norm": 2.203125, "grad_norm_var": 0.01640625, "learning_rate": 0.0001, "loss": 4.3723, "loss/crossentropy": 2.3049341440200806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23550046980381012, "step": 5390 }, { "epoch": 0.10784, "grad_norm": 2.21875, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 4.813, "loss/crossentropy": 2.2687143087387085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2854095697402954, "step": 5392 }, { "epoch": 0.10788, "grad_norm": 2.28125, "grad_norm_var": 0.014557902018229167, "learning_rate": 0.0001, "loss": 4.6267, "loss/crossentropy": 2.029325544834137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23444775491952896, "step": 5394 }, { "epoch": 0.10792, "grad_norm": 2.3125, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 4.5214, "loss/crossentropy": 2.2012031078338623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27986764907836914, "step": 5396 }, { "epoch": 0.10796, "grad_norm": 2.3125, "grad_norm_var": 0.012626139322916667, "learning_rate": 0.0001, "loss": 4.65, "loss/crossentropy": 2.2396020889282227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24839117377996445, "step": 5398 }, { "epoch": 0.108, "grad_norm": 2.3125, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 4.4634, "loss/crossentropy": 2.1481886506080627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545444592833519, "step": 5400 }, { "epoch": 0.10804, "grad_norm": 2.21875, "grad_norm_var": 0.006005859375, "learning_rate": 0.0001, "loss": 4.6109, "loss/crossentropy": 1.9799351692199707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23296385258436203, "step": 5402 }, { "epoch": 0.10808, "grad_norm": 2.34375, "grad_norm_var": 0.006180826822916667, "learning_rate": 0.0001, "loss": 4.5612, "loss/crossentropy": 1.845237910747528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316332384943962, "step": 5404 }, { "epoch": 0.10812, "grad_norm": 2.234375, "grad_norm_var": 0.005692545572916667, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 2.078865647315979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.251836359500885, "step": 5406 }, { "epoch": 0.10816, "grad_norm": 2.171875, "grad_norm_var": 0.006245930989583333, "learning_rate": 0.0001, "loss": 4.4409, "loss/crossentropy": 2.031971752643585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23658733069896698, "step": 5408 }, { "epoch": 0.1082, "grad_norm": 2.21875, "grad_norm_var": 0.00504150390625, "learning_rate": 0.0001, "loss": 4.3034, "loss/crossentropy": 1.8173908591270447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169174626469612, "step": 5410 }, { "epoch": 0.10824, "grad_norm": 2.25, "grad_norm_var": 0.0086090087890625, "learning_rate": 0.0001, "loss": 4.838, "loss/crossentropy": 2.2501285672187805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.240458145737648, "step": 5412 }, { "epoch": 0.10828, "grad_norm": 2.484375, "grad_norm_var": 0.015168253580729167, "learning_rate": 0.0001, "loss": 4.5449, "loss/crossentropy": 2.256573438644409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26129382848739624, "step": 5414 }, { "epoch": 0.10832, "grad_norm": 2.359375, "grad_norm_var": 0.0153717041015625, "learning_rate": 0.0001, "loss": 4.7704, "loss/crossentropy": 2.2014705538749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2334136888384819, "step": 5416 }, { "epoch": 0.10836, "grad_norm": 2.34375, "grad_norm_var": 0.013016764322916667, "learning_rate": 0.0001, "loss": 4.4046, "loss/crossentropy": 1.8590609431266785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2271011471748352, "step": 5418 }, { "epoch": 0.1084, "grad_norm": 2.40625, "grad_norm_var": 0.017378743489583334, "learning_rate": 0.0001, "loss": 4.9419, "loss/crossentropy": 2.2923961877822876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25833888351917267, "step": 5420 }, { "epoch": 0.10844, "grad_norm": 2.21875, "grad_norm_var": 0.017723592122395833, "learning_rate": 0.0001, "loss": 4.4535, "loss/crossentropy": 2.1932299733161926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26209479570388794, "step": 5422 }, { "epoch": 0.10848, "grad_norm": 2.390625, "grad_norm_var": 0.016185506184895834, "learning_rate": 0.0001, "loss": 4.7057, "loss/crossentropy": 2.3909924030303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2525275945663452, "step": 5424 }, { "epoch": 0.10852, "grad_norm": 2.296875, "grad_norm_var": 0.011324055989583333, "learning_rate": 0.0001, "loss": 4.7509, "loss/crossentropy": 2.423817992210388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2635423541069031, "step": 5426 }, { "epoch": 0.10856, "grad_norm": 2.34375, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.7082, "loss/crossentropy": 1.9641632437705994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460889369249344, "step": 5428 }, { "epoch": 0.1086, "grad_norm": 2.34375, "grad_norm_var": 0.0077707926432291664, "learning_rate": 0.0001, "loss": 4.6347, "loss/crossentropy": 2.027769148349762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2357894703745842, "step": 5430 }, { "epoch": 0.10864, "grad_norm": 2.53125, "grad_norm_var": 0.0128570556640625, "learning_rate": 0.0001, "loss": 4.4833, "loss/crossentropy": 2.122319996356964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2445123866200447, "step": 5432 }, { "epoch": 0.10868, "grad_norm": 3.53125, "grad_norm_var": 0.0993072509765625, "learning_rate": 0.0001, "loss": 4.6332, "loss/crossentropy": 1.8631052374839783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23456327617168427, "step": 5434 }, { "epoch": 0.10872, "grad_norm": 2.59375, "grad_norm_var": 0.1000885009765625, "learning_rate": 0.0001, "loss": 4.6022, "loss/crossentropy": 2.184281885623932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24073782563209534, "step": 5436 }, { "epoch": 0.10876, "grad_norm": 2.453125, "grad_norm_var": 0.09521077473958334, "learning_rate": 0.0001, "loss": 4.7912, "loss/crossentropy": 1.9587833881378174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22247321158647537, "step": 5438 }, { "epoch": 0.1088, "grad_norm": 2.296875, "grad_norm_var": 0.09562886555989583, "learning_rate": 0.0001, "loss": 4.5185, "loss/crossentropy": 2.334655284881592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24817728251218796, "step": 5440 }, { "epoch": 0.10884, "grad_norm": 2.109375, "grad_norm_var": 0.10161031087239583, "learning_rate": 0.0001, "loss": 4.3817, "loss/crossentropy": 2.1424371004104614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2622353136539459, "step": 5442 }, { "epoch": 0.10888, "grad_norm": 2.203125, "grad_norm_var": 0.10598958333333333, "learning_rate": 0.0001, "loss": 4.5876, "loss/crossentropy": 2.0363662242889404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2450125440955162, "step": 5444 }, { "epoch": 0.10892, "grad_norm": 2.1875, "grad_norm_var": 0.110791015625, "learning_rate": 0.0001, "loss": 4.4015, "loss/crossentropy": 2.0536006689071655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23283181339502335, "step": 5446 }, { "epoch": 0.10896, "grad_norm": 2.234375, "grad_norm_var": 0.10741780598958334, "learning_rate": 0.0001, "loss": 4.5194, "loss/crossentropy": 2.2678059339523315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24513862282037735, "step": 5448 }, { "epoch": 0.109, "grad_norm": 2.203125, "grad_norm_var": 0.02115478515625, "learning_rate": 0.0001, "loss": 4.7404, "loss/crossentropy": 2.406686782836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2901146113872528, "step": 5450 }, { "epoch": 0.10904, "grad_norm": 2.296875, "grad_norm_var": 0.017801920572916668, "learning_rate": 0.0001, "loss": 4.4724, "loss/crossentropy": 2.352605938911438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25215400755405426, "step": 5452 }, { "epoch": 0.10908, "grad_norm": 2.203125, "grad_norm_var": 0.014546712239583334, "learning_rate": 0.0001, "loss": 4.5593, "loss/crossentropy": 1.9139850735664368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22682765871286392, "step": 5454 }, { "epoch": 0.10912, "grad_norm": 2.171875, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 4.6163, "loss/crossentropy": 2.3240445852279663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24574412405490875, "step": 5456 }, { "epoch": 0.10916, "grad_norm": 2.21875, "grad_norm_var": 0.0123687744140625, "learning_rate": 0.0001, "loss": 4.3356, "loss/crossentropy": 1.9347040057182312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159302830696106, "step": 5458 }, { "epoch": 0.1092, "grad_norm": 2.21875, "grad_norm_var": 0.012360636393229167, "learning_rate": 0.0001, "loss": 4.6539, "loss/crossentropy": 1.8933109641075134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.284725621342659, "step": 5460 }, { "epoch": 0.10924, "grad_norm": 2.15625, "grad_norm_var": 0.0129058837890625, "learning_rate": 0.0001, "loss": 4.2488, "loss/crossentropy": 2.3611297607421875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2651517689228058, "step": 5462 }, { "epoch": 0.10928, "grad_norm": 2.140625, "grad_norm_var": 0.0235504150390625, "learning_rate": 0.0001, "loss": 4.4564, "loss/crossentropy": 1.828608751296997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2337760180234909, "step": 5464 }, { "epoch": 0.10932, "grad_norm": 2.140625, "grad_norm_var": 0.015412394205729167, "learning_rate": 0.0001, "loss": 4.321, "loss/crossentropy": 2.1374374628067017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2525453567504883, "step": 5466 }, { "epoch": 0.10936, "grad_norm": 2.125, "grad_norm_var": 0.0150543212890625, "learning_rate": 0.0001, "loss": 4.5307, "loss/crossentropy": 1.8054441213607788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2252344712615013, "step": 5468 }, { "epoch": 0.1094, "grad_norm": 2.421875, "grad_norm_var": 0.022972615559895833, "learning_rate": 0.0001, "loss": 4.616, "loss/crossentropy": 2.1468498706817627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2604861631989479, "step": 5470 }, { "epoch": 0.10944, "grad_norm": 2.359375, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 4.7298, "loss/crossentropy": 2.2180548906326294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2603686898946762, "step": 5472 }, { "epoch": 0.10948, "grad_norm": 2.265625, "grad_norm_var": 0.022261555989583334, "learning_rate": 0.0001, "loss": 4.5263, "loss/crossentropy": 1.9773708581924438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23676057159900665, "step": 5474 }, { "epoch": 0.10952, "grad_norm": 2.40625, "grad_norm_var": 0.023485310872395835, "learning_rate": 0.0001, "loss": 4.6156, "loss/crossentropy": 1.9277283549308777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22822558879852295, "step": 5476 }, { "epoch": 0.10956, "grad_norm": 2.34375, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 4.5529, "loss/crossentropy": 2.0625431537628174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565220817923546, "step": 5478 }, { "epoch": 0.1096, "grad_norm": 2.25, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 4.7956, "loss/crossentropy": 2.383894443511963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563806623220444, "step": 5480 }, { "epoch": 0.10964, "grad_norm": 2.171875, "grad_norm_var": 0.0120758056640625, "learning_rate": 0.0001, "loss": 4.5226, "loss/crossentropy": 2.409442663192749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2637571170926094, "step": 5482 }, { "epoch": 0.10968, "grad_norm": 2.203125, "grad_norm_var": 0.0151763916015625, "learning_rate": 0.0001, "loss": 4.743, "loss/crossentropy": 2.1789854764938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23846548050642014, "step": 5484 }, { "epoch": 0.10972, "grad_norm": 2.265625, "grad_norm_var": 0.011839803059895833, "learning_rate": 0.0001, "loss": 4.4108, "loss/crossentropy": 2.127842903137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2622772455215454, "step": 5486 }, { "epoch": 0.10976, "grad_norm": 2.1875, "grad_norm_var": 0.01412353515625, "learning_rate": 0.0001, "loss": 4.6032, "loss/crossentropy": 2.107556462287903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24714312702417374, "step": 5488 }, { "epoch": 0.1098, "grad_norm": 2.578125, "grad_norm_var": 0.022459920247395834, "learning_rate": 0.0001, "loss": 4.6525, "loss/crossentropy": 2.1959601640701294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23568396270275116, "step": 5490 }, { "epoch": 0.10984, "grad_norm": 2.171875, "grad_norm_var": 0.021826171875, "learning_rate": 0.0001, "loss": 4.5584, "loss/crossentropy": 2.1246761083602905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24083472788333893, "step": 5492 }, { "epoch": 0.10988, "grad_norm": 2.15625, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 4.6436, "loss/crossentropy": 2.091724157333374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527346611022949, "step": 5494 }, { "epoch": 0.10992, "grad_norm": 2.109375, "grad_norm_var": 0.026056925455729168, "learning_rate": 0.0001, "loss": 4.3207, "loss/crossentropy": 1.8898470997810364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20916878432035446, "step": 5496 }, { "epoch": 0.10996, "grad_norm": 2.21875, "grad_norm_var": 0.025951131184895834, "learning_rate": 0.0001, "loss": 4.399, "loss/crossentropy": 2.1901716589927673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2409309297800064, "step": 5498 }, { "epoch": 0.11, "grad_norm": 2.234375, "grad_norm_var": 0.017626953125, "learning_rate": 0.0001, "loss": 4.6897, "loss/crossentropy": 2.1018574237823486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24559657275676727, "step": 5500 }, { "epoch": 0.11004, "grad_norm": 2.1875, "grad_norm_var": 0.019449869791666668, "learning_rate": 0.0001, "loss": 3.8159, "loss/crossentropy": 2.0575350522994995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23721858859062195, "step": 5502 }, { "epoch": 0.11008, "grad_norm": 2.15625, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 4.6347, "loss/crossentropy": 2.1846336126327515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2378462702035904, "step": 5504 }, { "epoch": 0.11012, "grad_norm": 2.4375, "grad_norm_var": 0.014574178059895833, "learning_rate": 0.0001, "loss": 4.4028, "loss/crossentropy": 2.1359363198280334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26211391389369965, "step": 5506 }, { "epoch": 0.11016, "grad_norm": 2.265625, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 4.5621, "loss/crossentropy": 2.236825942993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557792589068413, "step": 5508 }, { "epoch": 0.1102, "grad_norm": 2.25, "grad_norm_var": 0.009521484375, "learning_rate": 0.0001, "loss": 4.3234, "loss/crossentropy": 2.3140580654144287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2554958164691925, "step": 5510 }, { "epoch": 0.11024, "grad_norm": 2.140625, "grad_norm_var": 0.00904541015625, "learning_rate": 0.0001, "loss": 4.4382, "loss/crossentropy": 1.7190355062484741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19964009523391724, "step": 5512 }, { "epoch": 0.11028, "grad_norm": 2.203125, "grad_norm_var": 0.014631144205729167, "learning_rate": 0.0001, "loss": 4.3831, "loss/crossentropy": 1.8326427340507507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199154868721962, "step": 5514 }, { "epoch": 0.11032, "grad_norm": 2.109375, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 4.4494, "loss/crossentropy": 1.9013121724128723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20293578505516052, "step": 5516 }, { "epoch": 0.11036, "grad_norm": 2.1875, "grad_norm_var": 0.0138671875, "learning_rate": 0.0001, "loss": 4.7056, "loss/crossentropy": 2.0221983790397644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23435519635677338, "step": 5518 }, { "epoch": 0.1104, "grad_norm": 2.359375, "grad_norm_var": 0.0140533447265625, "learning_rate": 0.0001, "loss": 4.6157, "loss/crossentropy": 2.153970956802368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22127598524093628, "step": 5520 }, { "epoch": 0.11044, "grad_norm": 2.125, "grad_norm_var": 0.010835774739583333, "learning_rate": 0.0001, "loss": 4.4516, "loss/crossentropy": 1.8674496412277222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.232261061668396, "step": 5522 }, { "epoch": 0.11048, "grad_norm": 2.078125, "grad_norm_var": 0.012495930989583333, "learning_rate": 0.0001, "loss": 4.6528, "loss/crossentropy": 2.1575759649276733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22218701988458633, "step": 5524 }, { "epoch": 0.11052, "grad_norm": 2.25, "grad_norm_var": 0.012434895833333333, "learning_rate": 0.0001, "loss": 4.5553, "loss/crossentropy": 2.054452419281006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25137215852737427, "step": 5526 }, { "epoch": 0.11056, "grad_norm": 2.296875, "grad_norm_var": 0.022639973958333334, "learning_rate": 0.0001, "loss": 4.4789, "loss/crossentropy": 1.966478705406189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23289234936237335, "step": 5528 }, { "epoch": 0.1106, "grad_norm": 2.234375, "grad_norm_var": 0.017626953125, "learning_rate": 0.0001, "loss": 4.687, "loss/crossentropy": 2.171034336090088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23373593389987946, "step": 5530 }, { "epoch": 0.11064, "grad_norm": 1.953125, "grad_norm_var": 0.022362263997395833, "learning_rate": 0.0001, "loss": 4.142, "loss/crossentropy": 2.2416292428970337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24835532158613205, "step": 5532 }, { "epoch": 0.11068, "grad_norm": 2.53125, "grad_norm_var": 0.02720947265625, "learning_rate": 0.0001, "loss": 4.6252, "loss/crossentropy": 2.2599780559539795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24948878586292267, "step": 5534 }, { "epoch": 0.11072, "grad_norm": 2.34375, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 4.6984, "loss/crossentropy": 2.2292014360427856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26616473495960236, "step": 5536 }, { "epoch": 0.11076, "grad_norm": 2.375, "grad_norm_var": 0.027424112955729166, "learning_rate": 0.0001, "loss": 4.336, "loss/crossentropy": 1.9285388588905334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22964124381542206, "step": 5538 }, { "epoch": 0.1108, "grad_norm": 2.171875, "grad_norm_var": 0.0251953125, "learning_rate": 0.0001, "loss": 4.2861, "loss/crossentropy": 1.864789366722107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2264304906129837, "step": 5540 }, { "epoch": 0.11084, "grad_norm": 2.234375, "grad_norm_var": 0.0250885009765625, "learning_rate": 0.0001, "loss": 4.5163, "loss/crossentropy": 1.8676912188529968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21919699758291245, "step": 5542 }, { "epoch": 0.11088, "grad_norm": 2.234375, "grad_norm_var": 0.0190338134765625, "learning_rate": 0.0001, "loss": 4.3612, "loss/crossentropy": 2.34523469209671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25638002157211304, "step": 5544 }, { "epoch": 0.11092, "grad_norm": 2.09375, "grad_norm_var": 0.020992024739583334, "learning_rate": 0.0001, "loss": 4.5165, "loss/crossentropy": 2.2903120517730713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25111711025238037, "step": 5546 }, { "epoch": 0.11096, "grad_norm": 2.15625, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 4.5336, "loss/crossentropy": 2.2106658220291138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22340595722198486, "step": 5548 }, { "epoch": 0.111, "grad_norm": 2.203125, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 4.6305, "loss/crossentropy": 2.0777581334114075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2446284517645836, "step": 5550 }, { "epoch": 0.11104, "grad_norm": 2.234375, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 4.5507, "loss/crossentropy": 2.131237506866455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24113191664218903, "step": 5552 }, { "epoch": 0.11108, "grad_norm": 2.578125, "grad_norm_var": 0.018778483072916668, "learning_rate": 0.0001, "loss": 4.6337, "loss/crossentropy": 2.190987467765808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2528213635087013, "step": 5554 }, { "epoch": 0.11112, "grad_norm": 2.25, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 4.4889, "loss/crossentropy": 2.26843523979187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27781064808368683, "step": 5556 }, { "epoch": 0.11116, "grad_norm": 2.25, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 4.2242, "loss/crossentropy": 1.9507999420166016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23172564804553986, "step": 5558 }, { "epoch": 0.1112, "grad_norm": 2.28125, "grad_norm_var": 0.016966756184895834, "learning_rate": 0.0001, "loss": 4.4821, "loss/crossentropy": 2.0738234519958496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24497026205062866, "step": 5560 }, { "epoch": 0.11124, "grad_norm": 2.515625, "grad_norm_var": 0.019261678059895832, "learning_rate": 0.0001, "loss": 4.9501, "loss/crossentropy": 2.273179054260254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2917838394641876, "step": 5562 }, { "epoch": 0.11128, "grad_norm": 2.359375, "grad_norm_var": 0.017899576822916666, "learning_rate": 0.0001, "loss": 4.774, "loss/crossentropy": 2.085157036781311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2488701120018959, "step": 5564 }, { "epoch": 0.11132, "grad_norm": 3.109375, "grad_norm_var": 0.05950113932291667, "learning_rate": 0.0001, "loss": 4.2869, "loss/crossentropy": 2.0528116822242737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24717991054058075, "step": 5566 }, { "epoch": 0.11136, "grad_norm": 7.0, "grad_norm_var": 1.3981597900390625, "learning_rate": 0.0001, "loss": 4.4443, "loss/crossentropy": 2.0651500821113586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26806148886680603, "step": 5568 }, { "epoch": 0.1114, "grad_norm": 2.328125, "grad_norm_var": 1.4721588134765624, "learning_rate": 0.0001, "loss": 4.6162, "loss/crossentropy": 2.1860616207122803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23045460134744644, "step": 5570 }, { "epoch": 0.11144, "grad_norm": 2.328125, "grad_norm_var": 1.460399373372396, "learning_rate": 0.0001, "loss": 4.3629, "loss/crossentropy": 1.6931262016296387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20816385000944138, "step": 5572 }, { "epoch": 0.11148, "grad_norm": 2.28125, "grad_norm_var": 1.4581858317057292, "learning_rate": 0.0001, "loss": 4.3376, "loss/crossentropy": 2.199341118335724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2490597665309906, "step": 5574 }, { "epoch": 0.11152, "grad_norm": 2.484375, "grad_norm_var": 1.4561513264973958, "learning_rate": 0.0001, "loss": 4.6627, "loss/crossentropy": 2.2010069489479065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24095547199249268, "step": 5576 }, { "epoch": 0.11156, "grad_norm": 2.234375, "grad_norm_var": 1.4810129801432292, "learning_rate": 0.0001, "loss": 4.5243, "loss/crossentropy": 1.9907150864601135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122703418135643, "step": 5578 }, { "epoch": 0.1116, "grad_norm": 2.21875, "grad_norm_var": 1.5023508707682292, "learning_rate": 0.0001, "loss": 4.2618, "loss/crossentropy": 2.196335554122925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25111766904592514, "step": 5580 }, { "epoch": 0.11164, "grad_norm": 2.171875, "grad_norm_var": 1.5003000895182292, "learning_rate": 0.0001, "loss": 4.5104, "loss/crossentropy": 2.0762988924980164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23773670941591263, "step": 5582 }, { "epoch": 0.11168, "grad_norm": 2.140625, "grad_norm_var": 0.15530192057291667, "learning_rate": 0.0001, "loss": 4.3166, "loss/crossentropy": 2.0803143978118896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22162869572639465, "step": 5584 }, { "epoch": 0.11172, "grad_norm": 2.3125, "grad_norm_var": 0.0084136962890625, "learning_rate": 0.0001, "loss": 4.5596, "loss/crossentropy": 2.1821994185447693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26280316710472107, "step": 5586 }, { "epoch": 0.11176, "grad_norm": 2.171875, "grad_norm_var": 0.011847941080729167, "learning_rate": 0.0001, "loss": 4.4328, "loss/crossentropy": 2.1899439096450806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24677179753780365, "step": 5588 }, { "epoch": 0.1118, "grad_norm": 2.21875, "grad_norm_var": 0.0116851806640625, "learning_rate": 0.0001, "loss": 4.4361, "loss/crossentropy": 2.334734559059143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2481135129928589, "step": 5590 }, { "epoch": 0.11184, "grad_norm": 2.34375, "grad_norm_var": 0.007291666666666667, "learning_rate": 0.0001, "loss": 4.7838, "loss/crossentropy": 2.2976341247558594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2454354390501976, "step": 5592 }, { "epoch": 0.11188, "grad_norm": 2.3125, "grad_norm_var": 0.007477823893229167, "learning_rate": 0.0001, "loss": 4.7148, "loss/crossentropy": 2.3243749141693115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2735731601715088, "step": 5594 }, { "epoch": 0.11192, "grad_norm": 2.28125, "grad_norm_var": 0.007372029622395833, "learning_rate": 0.0001, "loss": 4.7124, "loss/crossentropy": 2.0328271985054016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21919244527816772, "step": 5596 }, { "epoch": 0.11196, "grad_norm": 2.234375, "grad_norm_var": 0.0077789306640625, "learning_rate": 0.0001, "loss": 4.4584, "loss/crossentropy": 2.249367594718933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2577860951423645, "step": 5598 }, { "epoch": 0.112, "grad_norm": 2.046875, "grad_norm_var": 0.009505208333333333, "learning_rate": 0.0001, "loss": 4.2594, "loss/crossentropy": 1.9844761490821838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142435386776924, "step": 5600 }, { "epoch": 0.11204, "grad_norm": 2.375, "grad_norm_var": 0.0111328125, "learning_rate": 0.0001, "loss": 4.6043, "loss/crossentropy": 2.1334372758865356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23296397179365158, "step": 5602 }, { "epoch": 0.11208, "grad_norm": 2.3125, "grad_norm_var": 0.0078084309895833336, "learning_rate": 0.0001, "loss": 4.5308, "loss/crossentropy": 2.119946002960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23398507386446, "step": 5604 }, { "epoch": 0.11212, "grad_norm": 2.25, "grad_norm_var": 0.008040364583333333, "learning_rate": 0.0001, "loss": 4.5011, "loss/crossentropy": 2.0414544343948364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23420867323875427, "step": 5606 }, { "epoch": 0.11216, "grad_norm": 2.125, "grad_norm_var": 0.009130859375, "learning_rate": 0.0001, "loss": 4.2092, "loss/crossentropy": 1.9592725038528442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23922864347696304, "step": 5608 }, { "epoch": 0.1122, "grad_norm": 2.4375, "grad_norm_var": 0.011188761393229166, "learning_rate": 0.0001, "loss": 4.4795, "loss/crossentropy": 2.150269627571106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2528265416622162, "step": 5610 }, { "epoch": 0.11224, "grad_norm": 2.578125, "grad_norm_var": 0.3099772135416667, "learning_rate": 0.0001, "loss": 4.6234, "loss/crossentropy": 2.0591378211975098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2430240362882614, "step": 5612 }, { "epoch": 0.11228, "grad_norm": 2.171875, "grad_norm_var": 0.30794169108072916, "learning_rate": 0.0001, "loss": 4.4251, "loss/crossentropy": 2.2132861614227295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22676552087068558, "step": 5614 }, { "epoch": 0.11232, "grad_norm": 2.375, "grad_norm_var": 0.3002237955729167, "learning_rate": 0.0001, "loss": 4.4332, "loss/crossentropy": 1.9607917070388794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23068836331367493, "step": 5616 }, { "epoch": 0.11236, "grad_norm": 2.25, "grad_norm_var": 0.3021321614583333, "learning_rate": 0.0001, "loss": 4.549, "loss/crossentropy": 2.2245940566062927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25029121339321136, "step": 5618 }, { "epoch": 0.1124, "grad_norm": 2.234375, "grad_norm_var": 0.30278218587239586, "learning_rate": 0.0001, "loss": 4.4539, "loss/crossentropy": 2.2511253356933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25143079459667206, "step": 5620 }, { "epoch": 0.11244, "grad_norm": 2.328125, "grad_norm_var": 0.3026194254557292, "learning_rate": 0.0001, "loss": 4.4632, "loss/crossentropy": 2.2945470809936523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2196483463048935, "step": 5622 }, { "epoch": 0.11248, "grad_norm": 2.359375, "grad_norm_var": 0.29273681640625, "learning_rate": 0.0001, "loss": 4.8769, "loss/crossentropy": 2.2266393899917603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24020668864250183, "step": 5624 }, { "epoch": 0.11252, "grad_norm": 2.359375, "grad_norm_var": 0.29136454264322914, "learning_rate": 0.0001, "loss": 4.742, "loss/crossentropy": 2.2835845947265625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27199871838092804, "step": 5626 }, { "epoch": 0.11256, "grad_norm": 2.859375, "grad_norm_var": 0.026383463541666666, "learning_rate": 0.0001, "loss": 4.4213, "loss/crossentropy": 1.9576718211174011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21716968715190887, "step": 5628 }, { "epoch": 0.1126, "grad_norm": 2.375, "grad_norm_var": 0.044611612955729164, "learning_rate": 0.0001, "loss": 4.7695, "loss/crossentropy": 2.0955676436424255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24259109795093536, "step": 5630 }, { "epoch": 0.11264, "grad_norm": 2.5625, "grad_norm_var": 0.044840494791666664, "learning_rate": 0.0001, "loss": 4.4127, "loss/crossentropy": 2.119523346424103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2478165179491043, "step": 5632 }, { "epoch": 0.11268, "grad_norm": 2.265625, "grad_norm_var": 0.0444488525390625, "learning_rate": 0.0001, "loss": 4.5591, "loss/crossentropy": 2.189425826072693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24532486498355865, "step": 5634 }, { "epoch": 0.11272, "grad_norm": 2.390625, "grad_norm_var": 0.0467193603515625, "learning_rate": 0.0001, "loss": 4.4969, "loss/crossentropy": 2.215874433517456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24533041566610336, "step": 5636 }, { "epoch": 0.11276, "grad_norm": 2.515625, "grad_norm_var": 0.04868876139322917, "learning_rate": 0.0001, "loss": 4.5657, "loss/crossentropy": 2.226451873779297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545652836561203, "step": 5638 }, { "epoch": 0.1128, "grad_norm": 2.296875, "grad_norm_var": 0.05181884765625, "learning_rate": 0.0001, "loss": 4.4779, "loss/crossentropy": 2.0343592762947083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2503928989171982, "step": 5640 }, { "epoch": 0.11284, "grad_norm": 2.25, "grad_norm_var": 0.0535552978515625, "learning_rate": 0.0001, "loss": 4.6253, "loss/crossentropy": 2.142001748085022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24624411761760712, "step": 5642 }, { "epoch": 0.11288, "grad_norm": 2.140625, "grad_norm_var": 0.039383951822916666, "learning_rate": 0.0001, "loss": 4.4212, "loss/crossentropy": 1.822394609451294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21315739303827286, "step": 5644 }, { "epoch": 0.11292, "grad_norm": 2.140625, "grad_norm_var": 0.018863932291666666, "learning_rate": 0.0001, "loss": 4.1611, "loss/crossentropy": 2.3221731185913086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24407917261123657, "step": 5646 }, { "epoch": 0.11296, "grad_norm": 2.234375, "grad_norm_var": 0.012531534830729166, "learning_rate": 0.0001, "loss": 4.5591, "loss/crossentropy": 1.9784467816352844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22578515857458115, "step": 5648 }, { "epoch": 0.113, "grad_norm": 2.328125, "grad_norm_var": 0.0142730712890625, "learning_rate": 0.0001, "loss": 4.636, "loss/crossentropy": 2.2148635387420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25031210482120514, "step": 5650 }, { "epoch": 0.11304, "grad_norm": 2.21875, "grad_norm_var": 0.012369791666666666, "learning_rate": 0.0001, "loss": 4.4644, "loss/crossentropy": 1.9204095602035522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22073335200548172, "step": 5652 }, { "epoch": 0.11308, "grad_norm": 2.421875, "grad_norm_var": 0.009764607747395833, "learning_rate": 0.0001, "loss": 4.6601, "loss/crossentropy": 2.092605173587799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24161123484373093, "step": 5654 }, { "epoch": 0.11312, "grad_norm": 2.546875, "grad_norm_var": 0.0163726806640625, "learning_rate": 0.0001, "loss": 4.5977, "loss/crossentropy": 1.9546263217926025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22741927206516266, "step": 5656 }, { "epoch": 0.11316, "grad_norm": 2.28125, "grad_norm_var": 0.016974894205729167, "learning_rate": 0.0001, "loss": 4.6585, "loss/crossentropy": 2.256605863571167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560018301010132, "step": 5658 }, { "epoch": 0.1132, "grad_norm": 2.21875, "grad_norm_var": 0.0157135009765625, "learning_rate": 0.0001, "loss": 4.3605, "loss/crossentropy": 2.24527370929718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.270521879196167, "step": 5660 }, { "epoch": 0.11324, "grad_norm": 2.5, "grad_norm_var": 0.014029947916666667, "learning_rate": 0.0001, "loss": 4.5429, "loss/crossentropy": 1.8154722452163696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22786997258663177, "step": 5662 }, { "epoch": 0.11328, "grad_norm": 2.359375, "grad_norm_var": 0.0143951416015625, "learning_rate": 0.0001, "loss": 4.4079, "loss/crossentropy": 2.135699689388275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.253468282520771, "step": 5664 }, { "epoch": 0.11332, "grad_norm": 2.203125, "grad_norm_var": 0.015087890625, "learning_rate": 0.0001, "loss": 4.5381, "loss/crossentropy": 2.15896338224411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25651170313358307, "step": 5666 }, { "epoch": 0.11336, "grad_norm": 2.40625, "grad_norm_var": 0.012035115559895834, "learning_rate": 0.0001, "loss": 4.7877, "loss/crossentropy": 2.115864336490631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28546491265296936, "step": 5668 }, { "epoch": 0.1134, "grad_norm": 2.25, "grad_norm_var": 0.012214152018229167, "learning_rate": 0.0001, "loss": 4.4283, "loss/crossentropy": 2.2036256790161133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2397892326116562, "step": 5670 }, { "epoch": 0.11344, "grad_norm": 2.234375, "grad_norm_var": 0.008430989583333333, "learning_rate": 0.0001, "loss": 4.598, "loss/crossentropy": 2.4966647624969482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2984588146209717, "step": 5672 }, { "epoch": 0.11348, "grad_norm": 2.234375, "grad_norm_var": 0.008202107747395833, "learning_rate": 0.0001, "loss": 4.5289, "loss/crossentropy": 2.051860749721527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2748461365699768, "step": 5674 }, { "epoch": 0.11352, "grad_norm": 2.328125, "grad_norm_var": 0.007938639322916666, "learning_rate": 0.0001, "loss": 4.597, "loss/crossentropy": 2.046416461467743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2320190668106079, "step": 5676 }, { "epoch": 0.11356, "grad_norm": 2.21875, "grad_norm_var": 0.0054514567057291664, "learning_rate": 0.0001, "loss": 4.7389, "loss/crossentropy": 2.2385342121124268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21859309077262878, "step": 5678 }, { "epoch": 0.1136, "grad_norm": 2.265625, "grad_norm_var": 0.006843058268229166, "learning_rate": 0.0001, "loss": 4.2487, "loss/crossentropy": 1.8511550426483154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22065181285142899, "step": 5680 }, { "epoch": 0.11364, "grad_norm": 2.828125, "grad_norm_var": 0.031477864583333334, "learning_rate": 0.0001, "loss": 4.642, "loss/crossentropy": 2.304056167602539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29507844150066376, "step": 5682 }, { "epoch": 0.11368, "grad_norm": 2.4375, "grad_norm_var": 0.03329671223958333, "learning_rate": 0.0001, "loss": 4.5519, "loss/crossentropy": 1.993275225162506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33826952427625656, "step": 5684 }, { "epoch": 0.11372, "grad_norm": 2.140625, "grad_norm_var": 0.03498433430989583, "learning_rate": 0.0001, "loss": 4.4729, "loss/crossentropy": 2.1836347579956055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24539872258901596, "step": 5686 }, { "epoch": 0.11376, "grad_norm": 2.296875, "grad_norm_var": 0.03504231770833333, "learning_rate": 0.0001, "loss": 4.5908, "loss/crossentropy": 1.9467885494232178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180013582110405, "step": 5688 }, { "epoch": 0.1138, "grad_norm": 2.78125, "grad_norm_var": 0.04907938639322917, "learning_rate": 0.0001, "loss": 4.2888, "loss/crossentropy": 1.9907563924789429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551623433828354, "step": 5690 }, { "epoch": 0.11384, "grad_norm": 2.21875, "grad_norm_var": 0.049153645833333336, "learning_rate": 0.0001, "loss": 4.2186, "loss/crossentropy": 1.9452654719352722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24263620376586914, "step": 5692 }, { "epoch": 0.11388, "grad_norm": 2.171875, "grad_norm_var": 0.050093587239583334, "learning_rate": 0.0001, "loss": 4.5816, "loss/crossentropy": 2.2448811531066895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2508034110069275, "step": 5694 }, { "epoch": 0.11392, "grad_norm": 2.65625, "grad_norm_var": 0.055712890625, "learning_rate": 0.0001, "loss": 4.1868, "loss/crossentropy": 1.991935908794403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22709138691425323, "step": 5696 }, { "epoch": 0.11396, "grad_norm": 2.625, "grad_norm_var": 0.47700907389322916, "learning_rate": 0.0001, "loss": 4.8208, "loss/crossentropy": 2.0479623675346375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980189859867096, "step": 5698 }, { "epoch": 0.114, "grad_norm": 2.15625, "grad_norm_var": 0.4779581705729167, "learning_rate": 0.0001, "loss": 4.4742, "loss/crossentropy": 1.9319151639938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295292615890503, "step": 5700 }, { "epoch": 0.11404, "grad_norm": 2.296875, "grad_norm_var": 0.47330322265625, "learning_rate": 0.0001, "loss": 4.32, "loss/crossentropy": 1.985447645187378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246543288230896, "step": 5702 }, { "epoch": 0.11408, "grad_norm": 2.84375, "grad_norm_var": 0.478271484375, "learning_rate": 0.0001, "loss": 4.6602, "loss/crossentropy": 2.0016521215438843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25966253876686096, "step": 5704 }, { "epoch": 0.11412, "grad_norm": 2.421875, "grad_norm_var": 0.47038472493489586, "learning_rate": 0.0001, "loss": 4.9207, "loss/crossentropy": 2.112374246120453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3251144737005234, "step": 5706 }, { "epoch": 0.11416, "grad_norm": 2.328125, "grad_norm_var": 0.4698720296223958, "learning_rate": 0.0001, "loss": 4.5223, "loss/crossentropy": 2.0931158661842346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275928407907486, "step": 5708 }, { "epoch": 0.1142, "grad_norm": 2.328125, "grad_norm_var": 0.46104227701822914, "learning_rate": 0.0001, "loss": 4.4481, "loss/crossentropy": 1.9867743849754333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2419440597295761, "step": 5710 }, { "epoch": 0.11424, "grad_norm": 2.171875, "grad_norm_var": 0.4639312744140625, "learning_rate": 0.0001, "loss": 4.3874, "loss/crossentropy": 2.1822216510772705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2437271624803543, "step": 5712 }, { "epoch": 0.11428, "grad_norm": 6.46875, "grad_norm_var": 1.1088775634765624, "learning_rate": 0.0001, "loss": 4.5307, "loss/crossentropy": 2.412580370903015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3222763389348984, "step": 5714 }, { "epoch": 0.11432, "grad_norm": 2.28125, "grad_norm_var": 1.1019490559895833, "learning_rate": 0.0001, "loss": 4.2491, "loss/crossentropy": 1.676392376422882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24499019235372543, "step": 5716 }, { "epoch": 0.11436, "grad_norm": 2.328125, "grad_norm_var": 1.1039876302083333, "learning_rate": 0.0001, "loss": 4.4741, "loss/crossentropy": 1.7818856835365295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22935030609369278, "step": 5718 }, { "epoch": 0.1144, "grad_norm": 2.40625, "grad_norm_var": 1.0944732666015624, "learning_rate": 0.0001, "loss": 4.5445, "loss/crossentropy": 2.012014925479889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231942281126976, "step": 5720 }, { "epoch": 0.11444, "grad_norm": 2.296875, "grad_norm_var": 1.1023834228515625, "learning_rate": 0.0001, "loss": 4.5061, "loss/crossentropy": 2.3663965463638306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.254768967628479, "step": 5722 }, { "epoch": 0.11448, "grad_norm": 2.34375, "grad_norm_var": 1.107982381184896, "learning_rate": 0.0001, "loss": 4.5878, "loss/crossentropy": 2.343206286430359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26609115302562714, "step": 5724 }, { "epoch": 0.11452, "grad_norm": 2.21875, "grad_norm_var": 1.1131337483723958, "learning_rate": 0.0001, "loss": 4.5047, "loss/crossentropy": 1.8696978092193604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24282050132751465, "step": 5726 }, { "epoch": 0.11456, "grad_norm": 2.0625, "grad_norm_var": 1.1170644124348958, "learning_rate": 0.0001, "loss": 4.2198, "loss/crossentropy": 2.1430450677871704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23211582750082016, "step": 5728 }, { "epoch": 0.1146, "grad_norm": 2.421875, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 4.6868, "loss/crossentropy": 2.2231308221817017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23736387491226196, "step": 5730 }, { "epoch": 0.11464, "grad_norm": 2.421875, "grad_norm_var": 0.014069620768229167, "learning_rate": 0.0001, "loss": 4.4911, "loss/crossentropy": 1.8789280652999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24081497639417648, "step": 5732 }, { "epoch": 0.11468, "grad_norm": 2.1875, "grad_norm_var": 0.014875284830729167, "learning_rate": 0.0001, "loss": 4.4478, "loss/crossentropy": 2.0491732358932495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20530463755130768, "step": 5734 }, { "epoch": 0.11472, "grad_norm": 2.296875, "grad_norm_var": 0.012516276041666666, "learning_rate": 0.0001, "loss": 4.4352, "loss/crossentropy": 2.067046642303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24701344966888428, "step": 5736 }, { "epoch": 0.11476, "grad_norm": 2.234375, "grad_norm_var": 0.012791951497395834, "learning_rate": 0.0001, "loss": 4.4515, "loss/crossentropy": 2.0207647681236267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2243770807981491, "step": 5738 }, { "epoch": 0.1148, "grad_norm": 2.21875, "grad_norm_var": 0.013084920247395833, "learning_rate": 0.0001, "loss": 4.9205, "loss/crossentropy": 2.230514347553253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24377264082431793, "step": 5740 }, { "epoch": 0.11484, "grad_norm": 2.078125, "grad_norm_var": 0.013997395833333334, "learning_rate": 0.0001, "loss": 4.3831, "loss/crossentropy": 2.38068687915802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24760407954454422, "step": 5742 }, { "epoch": 0.11488, "grad_norm": 2.3125, "grad_norm_var": 0.012483723958333333, "learning_rate": 0.0001, "loss": 4.5598, "loss/crossentropy": 2.238909125328064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2503668889403343, "step": 5744 }, { "epoch": 0.11492, "grad_norm": 2.1875, "grad_norm_var": 0.015265909830729167, "learning_rate": 0.0001, "loss": 4.4848, "loss/crossentropy": 1.7423101663589478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20424582809209824, "step": 5746 }, { "epoch": 0.11496, "grad_norm": 2.25, "grad_norm_var": 0.0181304931640625, "learning_rate": 0.0001, "loss": 4.6754, "loss/crossentropy": 2.5906827449798584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24731651693582535, "step": 5748 }, { "epoch": 0.115, "grad_norm": 2.6875, "grad_norm_var": 0.025830078125, "learning_rate": 0.0001, "loss": 4.7427, "loss/crossentropy": 2.418861746788025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505089193582535, "step": 5750 }, { "epoch": 0.11504, "grad_norm": 2.765625, "grad_norm_var": 0.03658447265625, "learning_rate": 0.0001, "loss": 4.869, "loss/crossentropy": 2.158658504486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2381347045302391, "step": 5752 }, { "epoch": 0.11508, "grad_norm": 2.40625, "grad_norm_var": 0.03860270182291667, "learning_rate": 0.0001, "loss": 4.5073, "loss/crossentropy": 2.0938435196876526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639065027236938, "step": 5754 }, { "epoch": 0.11512, "grad_norm": 2.046875, "grad_norm_var": 0.0441314697265625, "learning_rate": 0.0001, "loss": 4.2227, "loss/crossentropy": 2.023799479007721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24510329961776733, "step": 5756 }, { "epoch": 0.11516, "grad_norm": 2.265625, "grad_norm_var": 0.03854166666666667, "learning_rate": 0.0001, "loss": 4.5155, "loss/crossentropy": 2.3589184284210205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2778936177492142, "step": 5758 }, { "epoch": 0.1152, "grad_norm": 2.1875, "grad_norm_var": 0.041731770833333334, "learning_rate": 0.0001, "loss": 4.4209, "loss/crossentropy": 2.4897998571395874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23546544462442398, "step": 5760 }, { "epoch": 0.11524, "grad_norm": 2.1875, "grad_norm_var": 0.041747029622395834, "learning_rate": 0.0001, "loss": 4.4619, "loss/crossentropy": 2.0426196455955505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22708147019147873, "step": 5762 }, { "epoch": 0.11528, "grad_norm": 2.28125, "grad_norm_var": 0.0397369384765625, "learning_rate": 0.0001, "loss": 4.6661, "loss/crossentropy": 2.2582051753997803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25646351277828217, "step": 5764 }, { "epoch": 0.11532, "grad_norm": 2.21875, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 4.5929, "loss/crossentropy": 2.181105613708496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2380223199725151, "step": 5766 }, { "epoch": 0.11536, "grad_norm": 2.46875, "grad_norm_var": 0.026200358072916666, "learning_rate": 0.0001, "loss": 4.5078, "loss/crossentropy": 2.236580967903137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26678355038166046, "step": 5768 }, { "epoch": 0.1154, "grad_norm": 2.203125, "grad_norm_var": 0.020361328125, "learning_rate": 0.0001, "loss": 4.6232, "loss/crossentropy": 1.9383749961853027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24920085072517395, "step": 5770 }, { "epoch": 0.11544, "grad_norm": 2.15625, "grad_norm_var": 0.018529256184895832, "learning_rate": 0.0001, "loss": 4.6044, "loss/crossentropy": 2.2856688499450684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.258881650865078, "step": 5772 }, { "epoch": 0.11548, "grad_norm": 2.28125, "grad_norm_var": 0.0239410400390625, "learning_rate": 0.0001, "loss": 4.8391, "loss/crossentropy": 2.2897390127182007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24453644454479218, "step": 5774 }, { "epoch": 0.11552, "grad_norm": 2.25, "grad_norm_var": 0.014549763997395833, "learning_rate": 0.0001, "loss": 4.5698, "loss/crossentropy": 2.0502785444259644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24106161296367645, "step": 5776 }, { "epoch": 0.11556, "grad_norm": 2.21875, "grad_norm_var": 0.01441650390625, "learning_rate": 0.0001, "loss": 4.5665, "loss/crossentropy": 2.344050645828247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24690424650907516, "step": 5778 }, { "epoch": 0.1156, "grad_norm": 2.609375, "grad_norm_var": 0.021239217122395834, "learning_rate": 0.0001, "loss": 4.5273, "loss/crossentropy": 2.2274389266967773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25618284940719604, "step": 5780 }, { "epoch": 0.11564, "grad_norm": 2.203125, "grad_norm_var": 0.022184244791666665, "learning_rate": 0.0001, "loss": 4.4888, "loss/crossentropy": 2.063184678554535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22606099396944046, "step": 5782 }, { "epoch": 0.11568, "grad_norm": 2.203125, "grad_norm_var": 0.0206451416015625, "learning_rate": 0.0001, "loss": 4.2333, "loss/crossentropy": 2.093947410583496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23588553071022034, "step": 5784 }, { "epoch": 0.11572, "grad_norm": 2.015625, "grad_norm_var": 0.0246490478515625, "learning_rate": 0.0001, "loss": 4.426, "loss/crossentropy": 2.1599318981170654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20322780311107635, "step": 5786 }, { "epoch": 0.11576, "grad_norm": 2.40625, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 4.5398, "loss/crossentropy": 2.274693012237549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2704206556081772, "step": 5788 }, { "epoch": 0.1158, "grad_norm": 2.421875, "grad_norm_var": 0.0222076416015625, "learning_rate": 0.0001, "loss": 4.6755, "loss/crossentropy": 1.9712103009223938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28300437331199646, "step": 5790 }, { "epoch": 0.11584, "grad_norm": 2.09375, "grad_norm_var": 0.024344889322916667, "learning_rate": 0.0001, "loss": 4.2035, "loss/crossentropy": 2.027747690677643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24419110268354416, "step": 5792 }, { "epoch": 0.11588, "grad_norm": 2.34375, "grad_norm_var": 0.024251302083333332, "learning_rate": 0.0001, "loss": 4.5614, "loss/crossentropy": 2.162364959716797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2522001415491104, "step": 5794 }, { "epoch": 0.11592, "grad_norm": 2.28125, "grad_norm_var": 0.0166656494140625, "learning_rate": 0.0001, "loss": 4.4711, "loss/crossentropy": 2.558881998062134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26168718934059143, "step": 5796 }, { "epoch": 0.11596, "grad_norm": 2.125, "grad_norm_var": 0.0162994384765625, "learning_rate": 0.0001, "loss": 4.4108, "loss/crossentropy": 2.1027071475982666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24049362540245056, "step": 5798 }, { "epoch": 0.116, "grad_norm": 2.234375, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 4.6059, "loss/crossentropy": 2.488932490348816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2697457820177078, "step": 5800 }, { "epoch": 0.11604, "grad_norm": 2.375, "grad_norm_var": 0.0121734619140625, "learning_rate": 0.0001, "loss": 4.6706, "loss/crossentropy": 2.0441418886184692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23980429768562317, "step": 5802 }, { "epoch": 0.11608, "grad_norm": 2.375, "grad_norm_var": 0.0107421875, "learning_rate": 0.0001, "loss": 4.8471, "loss/crossentropy": 2.2873395681381226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3105090409517288, "step": 5804 }, { "epoch": 0.11612, "grad_norm": 2.265625, "grad_norm_var": 0.00826416015625, "learning_rate": 0.0001, "loss": 4.5388, "loss/crossentropy": 1.8773444890975952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20640414953231812, "step": 5806 }, { "epoch": 0.11616, "grad_norm": 2.375, "grad_norm_var": 0.007502237955729167, "learning_rate": 0.0001, "loss": 4.7338, "loss/crossentropy": 2.3736027479171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2679155319929123, "step": 5808 }, { "epoch": 0.1162, "grad_norm": 2.28125, "grad_norm_var": 0.010676066080729166, "learning_rate": 0.0001, "loss": 4.0496, "loss/crossentropy": 2.071690082550049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21868111193180084, "step": 5810 }, { "epoch": 0.11624, "grad_norm": 2.390625, "grad_norm_var": 0.015425618489583333, "learning_rate": 0.0001, "loss": 4.4029, "loss/crossentropy": 2.0531184673309326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24457989633083344, "step": 5812 }, { "epoch": 0.11628, "grad_norm": 2.3125, "grad_norm_var": 0.015718587239583335, "learning_rate": 0.0001, "loss": 4.6511, "loss/crossentropy": 2.1161271929740906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22809523344039917, "step": 5814 }, { "epoch": 0.11632, "grad_norm": 2.046875, "grad_norm_var": 0.0196929931640625, "learning_rate": 0.0001, "loss": 4.2736, "loss/crossentropy": 1.6557151675224304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20782940834760666, "step": 5816 }, { "epoch": 0.11636, "grad_norm": 2.3125, "grad_norm_var": 0.03704020182291667, "learning_rate": 0.0001, "loss": 4.5977, "loss/crossentropy": 2.2499040365219116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713513821363449, "step": 5818 }, { "epoch": 0.1164, "grad_norm": 2.390625, "grad_norm_var": 0.03723958333333333, "learning_rate": 0.0001, "loss": 4.6417, "loss/crossentropy": 2.3616446256637573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2613145411014557, "step": 5820 }, { "epoch": 0.11644, "grad_norm": 2.265625, "grad_norm_var": 0.0353515625, "learning_rate": 0.0001, "loss": 4.3744, "loss/crossentropy": 2.0932790637016296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23843754082918167, "step": 5822 }, { "epoch": 0.11648, "grad_norm": 2.25, "grad_norm_var": 0.03540751139322917, "learning_rate": 0.0001, "loss": 4.4089, "loss/crossentropy": 2.128177046775818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26579540967941284, "step": 5824 }, { "epoch": 0.11652, "grad_norm": 2.296875, "grad_norm_var": 0.031281534830729166, "learning_rate": 0.0001, "loss": 4.6942, "loss/crossentropy": 2.332372784614563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27953268587589264, "step": 5826 }, { "epoch": 0.11656, "grad_norm": 2.1875, "grad_norm_var": 0.0263092041015625, "learning_rate": 0.0001, "loss": 4.4615, "loss/crossentropy": 2.360959053039551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26249848306179047, "step": 5828 }, { "epoch": 0.1166, "grad_norm": 2.0, "grad_norm_var": 0.030745442708333334, "learning_rate": 0.0001, "loss": 3.9757, "loss/crossentropy": 1.9800339341163635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22766301035881042, "step": 5830 }, { "epoch": 0.11664, "grad_norm": 2.390625, "grad_norm_var": 0.029442342122395833, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 2.082980155944824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2368428260087967, "step": 5832 }, { "epoch": 0.11668, "grad_norm": 2.34375, "grad_norm_var": 0.011986287434895833, "learning_rate": 0.0001, "loss": 4.4441, "loss/crossentropy": 2.093027710914612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21121951937675476, "step": 5834 }, { "epoch": 0.11672, "grad_norm": 2.296875, "grad_norm_var": 0.011031087239583333, "learning_rate": 0.0001, "loss": 4.5008, "loss/crossentropy": 2.1329175233840942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25303877890110016, "step": 5836 }, { "epoch": 0.11676, "grad_norm": 2.21875, "grad_norm_var": 0.022932942708333334, "learning_rate": 0.0001, "loss": 4.5174, "loss/crossentropy": 1.79305762052536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20448636263608932, "step": 5838 }, { "epoch": 0.1168, "grad_norm": 2.28125, "grad_norm_var": 0.024933878580729166, "learning_rate": 0.0001, "loss": 4.2722, "loss/crossentropy": 1.957836627960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22745755314826965, "step": 5840 }, { "epoch": 0.11684, "grad_norm": 2.25, "grad_norm_var": 0.025007120768229165, "learning_rate": 0.0001, "loss": 4.3773, "loss/crossentropy": 2.1167174577713013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2437898963689804, "step": 5842 }, { "epoch": 0.11688, "grad_norm": 2.1875, "grad_norm_var": 0.024800618489583332, "learning_rate": 0.0001, "loss": 4.4856, "loss/crossentropy": 2.3288447856903076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2355937883257866, "step": 5844 }, { "epoch": 0.11692, "grad_norm": 2.125, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 4.4774, "loss/crossentropy": 2.2526416778564453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24917340278625488, "step": 5846 }, { "epoch": 0.11696, "grad_norm": 2.25, "grad_norm_var": 0.0198883056640625, "learning_rate": 0.0001, "loss": 4.391, "loss/crossentropy": 2.194224774837494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2625589966773987, "step": 5848 }, { "epoch": 0.117, "grad_norm": 2.34375, "grad_norm_var": 0.020116170247395832, "learning_rate": 0.0001, "loss": 4.4955, "loss/crossentropy": 2.0156877040863037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22422078251838684, "step": 5850 }, { "epoch": 0.11704, "grad_norm": 2.21875, "grad_norm_var": 0.01962890625, "learning_rate": 0.0001, "loss": 4.6231, "loss/crossentropy": 2.2480785846710205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24705884605646133, "step": 5852 }, { "epoch": 0.11708, "grad_norm": 2.25, "grad_norm_var": 0.0078521728515625, "learning_rate": 0.0001, "loss": 4.4817, "loss/crossentropy": 2.0915993452072144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24669666588306427, "step": 5854 }, { "epoch": 0.11712, "grad_norm": 2.171875, "grad_norm_var": 0.0062652587890625, "learning_rate": 0.0001, "loss": 4.4444, "loss/crossentropy": 2.1283876299858093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24228712916374207, "step": 5856 }, { "epoch": 0.11716, "grad_norm": 2.625, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 4.6582, "loss/crossentropy": 2.028861939907074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2281404584646225, "step": 5858 }, { "epoch": 0.1172, "grad_norm": 2.15625, "grad_norm_var": 0.015852864583333334, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.154610753059387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24233703315258026, "step": 5860 }, { "epoch": 0.11724, "grad_norm": 2.421875, "grad_norm_var": 0.015787760416666668, "learning_rate": 0.0001, "loss": 4.5059, "loss/crossentropy": 1.9396602511405945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22303076088428497, "step": 5862 }, { "epoch": 0.11728, "grad_norm": 2.578125, "grad_norm_var": 0.021061197916666666, "learning_rate": 0.0001, "loss": 4.7447, "loss/crossentropy": 2.053893029689789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26897912472486496, "step": 5864 }, { "epoch": 0.11732, "grad_norm": 2.0625, "grad_norm_var": 0.0263336181640625, "learning_rate": 0.0001, "loss": 4.3529, "loss/crossentropy": 1.990949273109436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22180908918380737, "step": 5866 }, { "epoch": 0.11736, "grad_norm": 2.453125, "grad_norm_var": 0.030085245768229168, "learning_rate": 0.0001, "loss": 4.6434, "loss/crossentropy": 2.0929455161094666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24055248498916626, "step": 5868 }, { "epoch": 0.1174, "grad_norm": 2.46875, "grad_norm_var": 0.03168843587239583, "learning_rate": 0.0001, "loss": 4.5465, "loss/crossentropy": 2.1476733684539795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2416895627975464, "step": 5870 }, { "epoch": 0.11744, "grad_norm": 2.21875, "grad_norm_var": 0.027962239583333333, "learning_rate": 0.0001, "loss": 4.5633, "loss/crossentropy": 1.74330335855484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001302987337112, "step": 5872 }, { "epoch": 0.11748, "grad_norm": 2.265625, "grad_norm_var": 0.021507771809895833, "learning_rate": 0.0001, "loss": 4.2725, "loss/crossentropy": 1.8903921246528625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21950766444206238, "step": 5874 }, { "epoch": 0.11752, "grad_norm": 2.140625, "grad_norm_var": 0.022069295247395832, "learning_rate": 0.0001, "loss": 4.3673, "loss/crossentropy": 1.798406720161438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20271167159080505, "step": 5876 }, { "epoch": 0.11756, "grad_norm": 2.25, "grad_norm_var": 0.021089680989583335, "learning_rate": 0.0001, "loss": 4.7128, "loss/crossentropy": 1.9651959538459778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21256517618894577, "step": 5878 }, { "epoch": 0.1176, "grad_norm": 2.109375, "grad_norm_var": 0.016844685872395834, "learning_rate": 0.0001, "loss": 4.4158, "loss/crossentropy": 2.039245307445526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23225219547748566, "step": 5880 }, { "epoch": 0.11764, "grad_norm": 2.40625, "grad_norm_var": 0.013329060872395833, "learning_rate": 0.0001, "loss": 4.8741, "loss/crossentropy": 2.500381350517273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31757834553718567, "step": 5882 }, { "epoch": 0.11768, "grad_norm": 2.59375, "grad_norm_var": 0.018195597330729167, "learning_rate": 0.0001, "loss": 4.6463, "loss/crossentropy": 2.0540305972099304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2474198415875435, "step": 5884 }, { "epoch": 0.11772, "grad_norm": 2.359375, "grad_norm_var": 0.016462198893229165, "learning_rate": 0.0001, "loss": 4.5047, "loss/crossentropy": 2.072624385356903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2482014298439026, "step": 5886 }, { "epoch": 0.11776, "grad_norm": 2.328125, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 4.4025, "loss/crossentropy": 2.153423309326172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234448604285717, "step": 5888 }, { "epoch": 0.1178, "grad_norm": 2.5625, "grad_norm_var": 0.025325520833333334, "learning_rate": 0.0001, "loss": 4.2698, "loss/crossentropy": 1.5941627621650696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20982372760772705, "step": 5890 }, { "epoch": 0.11784, "grad_norm": 2.390625, "grad_norm_var": 0.025886027018229167, "learning_rate": 0.0001, "loss": 4.5168, "loss/crossentropy": 2.2858930826187134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24936140328645706, "step": 5892 }, { "epoch": 0.11788, "grad_norm": 2.296875, "grad_norm_var": 0.028076171875, "learning_rate": 0.0001, "loss": 4.536, "loss/crossentropy": 2.333559274673462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26152122020721436, "step": 5894 }, { "epoch": 0.11792, "grad_norm": 2.296875, "grad_norm_var": 0.027106730143229167, "learning_rate": 0.0001, "loss": 4.412, "loss/crossentropy": 2.134613037109375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24929091334342957, "step": 5896 }, { "epoch": 0.11796, "grad_norm": 2.296875, "grad_norm_var": 0.026423136393229168, "learning_rate": 0.0001, "loss": 4.4052, "loss/crossentropy": 2.179764688014984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24523546546697617, "step": 5898 }, { "epoch": 0.118, "grad_norm": 2.109375, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 4.3431, "loss/crossentropy": 2.1184223294258118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21715932339429855, "step": 5900 }, { "epoch": 0.11804, "grad_norm": 2.078125, "grad_norm_var": 0.021882120768229166, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.024593770503998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23016374558210373, "step": 5902 }, { "epoch": 0.11808, "grad_norm": 2.265625, "grad_norm_var": 0.0204986572265625, "learning_rate": 0.0001, "loss": 4.3646, "loss/crossentropy": 2.077186107635498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2538782134652138, "step": 5904 }, { "epoch": 0.11812, "grad_norm": 2.359375, "grad_norm_var": 0.013190714518229167, "learning_rate": 0.0001, "loss": 4.9135, "loss/crossentropy": 2.2535945177078247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25025132298469543, "step": 5906 }, { "epoch": 0.11816, "grad_norm": 2.21875, "grad_norm_var": 0.01064453125, "learning_rate": 0.0001, "loss": 4.5512, "loss/crossentropy": 2.5321284532546997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2729681059718132, "step": 5908 }, { "epoch": 0.1182, "grad_norm": 2.234375, "grad_norm_var": 0.006403605143229167, "learning_rate": 0.0001, "loss": 4.332, "loss/crossentropy": 2.043885111808777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23721691966056824, "step": 5910 }, { "epoch": 0.11824, "grad_norm": 2.296875, "grad_norm_var": 0.005204264322916667, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 2.1343676447868347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23325734585523605, "step": 5912 }, { "epoch": 0.11828, "grad_norm": 2.03125, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 4.3551, "loss/crossentropy": 2.1164477467536926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24526448547840118, "step": 5914 }, { "epoch": 0.11832, "grad_norm": 2.1875, "grad_norm_var": 0.0064737955729166664, "learning_rate": 0.0001, "loss": 4.5162, "loss/crossentropy": 2.268216848373413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24837128818035126, "step": 5916 }, { "epoch": 0.11836, "grad_norm": 2.25, "grad_norm_var": 0.0057037353515625, "learning_rate": 0.0001, "loss": 4.4636, "loss/crossentropy": 2.074695885181427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24956009536981583, "step": 5918 }, { "epoch": 0.1184, "grad_norm": 2.203125, "grad_norm_var": 0.007372029622395833, "learning_rate": 0.0001, "loss": 4.3165, "loss/crossentropy": 1.9369722604751587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316487580537796, "step": 5920 }, { "epoch": 0.11844, "grad_norm": 2.15625, "grad_norm_var": 0.00611572265625, "learning_rate": 0.0001, "loss": 4.2789, "loss/crossentropy": 2.2189531326293945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23884039372205734, "step": 5922 }, { "epoch": 0.11848, "grad_norm": 2.34375, "grad_norm_var": 0.007323201497395833, "learning_rate": 0.0001, "loss": 4.5965, "loss/crossentropy": 2.3833028078079224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2504591718316078, "step": 5924 }, { "epoch": 0.11852, "grad_norm": 2.234375, "grad_norm_var": 0.007005818684895833, "learning_rate": 0.0001, "loss": 4.6346, "loss/crossentropy": 2.0443845987319946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23422807455062866, "step": 5926 }, { "epoch": 0.11856, "grad_norm": 2.203125, "grad_norm_var": 0.0073964436848958336, "learning_rate": 0.0001, "loss": 4.9061, "loss/crossentropy": 2.223625063896179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24985776841640472, "step": 5928 }, { "epoch": 0.1186, "grad_norm": 2.171875, "grad_norm_var": 0.005126953125, "learning_rate": 0.0001, "loss": 4.493, "loss/crossentropy": 2.1353545784950256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22846727818250656, "step": 5930 }, { "epoch": 0.11864, "grad_norm": 2.203125, "grad_norm_var": 0.005159505208333333, "learning_rate": 0.0001, "loss": 4.5786, "loss/crossentropy": 2.0497827529907227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21461189538240433, "step": 5932 }, { "epoch": 0.11868, "grad_norm": 2.28125, "grad_norm_var": 0.004715983072916667, "learning_rate": 0.0001, "loss": 4.5071, "loss/crossentropy": 1.9257362484931946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216043472290039, "step": 5934 }, { "epoch": 0.11872, "grad_norm": 2.171875, "grad_norm_var": 0.0034006754557291668, "learning_rate": 0.0001, "loss": 4.4212, "loss/crossentropy": 2.0458216071128845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23162957280874252, "step": 5936 }, { "epoch": 0.11876, "grad_norm": 2.1875, "grad_norm_var": 0.0033274332682291666, "learning_rate": 0.0001, "loss": 4.6109, "loss/crossentropy": 2.0786396861076355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22303655743598938, "step": 5938 }, { "epoch": 0.1188, "grad_norm": 2.546875, "grad_norm_var": 0.04327799479166667, "learning_rate": 0.0001, "loss": 4.4861, "loss/crossentropy": 1.9800568222999573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22236331552267075, "step": 5940 }, { "epoch": 0.11884, "grad_norm": 2.0625, "grad_norm_var": 0.0502349853515625, "learning_rate": 0.0001, "loss": 4.056, "loss/crossentropy": 1.882250189781189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21664132922887802, "step": 5942 }, { "epoch": 0.11888, "grad_norm": 2.203125, "grad_norm_var": 0.05025634765625, "learning_rate": 0.0001, "loss": 4.5965, "loss/crossentropy": 2.3682695627212524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2698971778154373, "step": 5944 }, { "epoch": 0.11892, "grad_norm": 2.265625, "grad_norm_var": 0.0505523681640625, "learning_rate": 0.0001, "loss": 4.6364, "loss/crossentropy": 2.225574493408203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2642917186021805, "step": 5946 }, { "epoch": 0.11896, "grad_norm": 2.1875, "grad_norm_var": 0.05032145182291667, "learning_rate": 0.0001, "loss": 4.3157, "loss/crossentropy": 1.8634169697761536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170402556657791, "step": 5948 }, { "epoch": 0.119, "grad_norm": 2.3125, "grad_norm_var": 0.05123291015625, "learning_rate": 0.0001, "loss": 5.0392, "loss/crossentropy": 2.4500025510787964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27845144271850586, "step": 5950 }, { "epoch": 0.11904, "grad_norm": 2.046875, "grad_norm_var": 0.054108683268229166, "learning_rate": 0.0001, "loss": 4.1179, "loss/crossentropy": 2.1532052755355835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24952176213264465, "step": 5952 }, { "epoch": 0.11908, "grad_norm": 2.140625, "grad_norm_var": 0.0561676025390625, "learning_rate": 0.0001, "loss": 4.2592, "loss/crossentropy": 2.066560387611389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356283888220787, "step": 5954 }, { "epoch": 0.11912, "grad_norm": 2.265625, "grad_norm_var": 0.01021728515625, "learning_rate": 0.0001, "loss": 4.6187, "loss/crossentropy": 1.9679089784622192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23729706555604935, "step": 5956 }, { "epoch": 0.11916, "grad_norm": 2.234375, "grad_norm_var": 0.0071441650390625, "learning_rate": 0.0001, "loss": 4.3463, "loss/crossentropy": 2.3490394353866577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26400282233953476, "step": 5958 }, { "epoch": 0.1192, "grad_norm": 2.296875, "grad_norm_var": 0.008576456705729167, "learning_rate": 0.0001, "loss": 4.3629, "loss/crossentropy": 2.145757555961609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23329483717679977, "step": 5960 }, { "epoch": 0.11924, "grad_norm": 2.1875, "grad_norm_var": 0.005399576822916667, "learning_rate": 0.0001, "loss": 4.2376, "loss/crossentropy": 2.0764617919921875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22371648252010345, "step": 5962 }, { "epoch": 0.11928, "grad_norm": 2.46875, "grad_norm_var": 0.010380045572916666, "learning_rate": 0.0001, "loss": 4.8106, "loss/crossentropy": 2.199389696121216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23210449516773224, "step": 5964 }, { "epoch": 0.11932, "grad_norm": 2.09375, "grad_norm_var": 0.01064453125, "learning_rate": 0.0001, "loss": 4.3515, "loss/crossentropy": 1.9008439183235168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474837511777878, "step": 5966 }, { "epoch": 0.11936, "grad_norm": 2.0, "grad_norm_var": 0.0148590087890625, "learning_rate": 0.0001, "loss": 4.4603, "loss/crossentropy": 2.1779539585113525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23485051095485687, "step": 5968 }, { "epoch": 0.1194, "grad_norm": 2.15625, "grad_norm_var": 0.014176432291666667, "learning_rate": 0.0001, "loss": 4.428, "loss/crossentropy": 2.267147421836853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24924689531326294, "step": 5970 }, { "epoch": 0.11944, "grad_norm": 2.1875, "grad_norm_var": 0.016292317708333334, "learning_rate": 0.0001, "loss": 4.0856, "loss/crossentropy": 2.0618110299110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24352984875440598, "step": 5972 }, { "epoch": 0.11948, "grad_norm": 2.34375, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 4.5802, "loss/crossentropy": 2.1419676542282104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23680058121681213, "step": 5974 }, { "epoch": 0.11952, "grad_norm": 2.265625, "grad_norm_var": 0.023368326822916667, "learning_rate": 0.0001, "loss": 4.6554, "loss/crossentropy": 2.0376622080802917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22515031695365906, "step": 5976 }, { "epoch": 0.11956, "grad_norm": 2.46875, "grad_norm_var": 0.025739542643229165, "learning_rate": 0.0001, "loss": 4.4213, "loss/crossentropy": 1.7675965428352356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219834603369236, "step": 5978 }, { "epoch": 0.1196, "grad_norm": 2.1875, "grad_norm_var": 0.024030558268229165, "learning_rate": 0.0001, "loss": 4.2886, "loss/crossentropy": 1.8919905424118042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2256327122449875, "step": 5980 }, { "epoch": 0.11964, "grad_norm": 2.15625, "grad_norm_var": 0.0228424072265625, "learning_rate": 0.0001, "loss": 4.6148, "loss/crossentropy": 2.287980794906616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25571687519550323, "step": 5982 }, { "epoch": 0.11968, "grad_norm": 2.15625, "grad_norm_var": 0.0169830322265625, "learning_rate": 0.0001, "loss": 4.3527, "loss/crossentropy": 2.1030094027519226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23401429504156113, "step": 5984 }, { "epoch": 0.11972, "grad_norm": 2.421875, "grad_norm_var": 0.0166412353515625, "learning_rate": 0.0001, "loss": 4.8857, "loss/crossentropy": 2.469533920288086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2548582851886749, "step": 5986 }, { "epoch": 0.11976, "grad_norm": 2.328125, "grad_norm_var": 0.018047841389973958, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 2.140324354171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22603372484445572, "step": 5988 }, { "epoch": 0.1198, "grad_norm": 2.140625, "grad_norm_var": 0.015773264567057292, "learning_rate": 0.0001, "loss": 4.3848, "loss/crossentropy": 2.1848061084747314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22712922096252441, "step": 5990 }, { "epoch": 0.11984, "grad_norm": 2.34375, "grad_norm_var": 0.015380605061848959, "learning_rate": 0.0001, "loss": 4.6457, "loss/crossentropy": 2.2872358560562134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352529615163803, "step": 5992 }, { "epoch": 0.11988, "grad_norm": 2.25, "grad_norm_var": 0.012318674723307292, "learning_rate": 0.0001, "loss": 4.6136, "loss/crossentropy": 2.082811713218689, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24107889831066132, "step": 5994 }, { "epoch": 0.11992, "grad_norm": 2.109375, "grad_norm_var": 0.013602447509765626, "learning_rate": 0.0001, "loss": 4.2089, "loss/crossentropy": 2.23664391040802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23429522663354874, "step": 5996 }, { "epoch": 0.11996, "grad_norm": 2.265625, "grad_norm_var": 0.013242340087890625, "learning_rate": 0.0001, "loss": 4.5237, "loss/crossentropy": 2.451270341873169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2541816979646683, "step": 5998 }, { "epoch": 0.12, "grad_norm": 2.109375, "grad_norm_var": 0.013561757405598958, "learning_rate": 0.0001, "loss": 4.5559, "loss/crossentropy": 2.1744157671928406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22349119931459427, "step": 6000 }, { "epoch": 0.12004, "grad_norm": 2.09375, "grad_norm_var": 0.013171132405598958, "learning_rate": 0.0001, "loss": 4.5532, "loss/crossentropy": 2.0316836833953857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259635865688324, "step": 6002 }, { "epoch": 0.12008, "grad_norm": 2.609375, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 4.3674, "loss/crossentropy": 2.0989437103271484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25056491047143936, "step": 6004 }, { "epoch": 0.12012, "grad_norm": 2.5625, "grad_norm_var": 0.02633056640625, "learning_rate": 0.0001, "loss": 4.0883, "loss/crossentropy": 1.9609100818634033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22969383746385574, "step": 6006 }, { "epoch": 0.12016, "grad_norm": 2.15625, "grad_norm_var": 0.027665201822916666, "learning_rate": 0.0001, "loss": 4.2094, "loss/crossentropy": 2.077883243560791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22850078344345093, "step": 6008 }, { "epoch": 0.1202, "grad_norm": 2.34375, "grad_norm_var": 1.6465779622395833, "learning_rate": 0.0001, "loss": 4.5659, "loss/crossentropy": 1.7967591285705566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126556932926178, "step": 6010 }, { "epoch": 0.12024, "grad_norm": 2.234375, "grad_norm_var": 1.62222900390625, "learning_rate": 0.0001, "loss": 4.4018, "loss/crossentropy": 1.6516226530075073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188895046710968, "step": 6012 }, { "epoch": 0.12028, "grad_norm": 2.21875, "grad_norm_var": 1.62984619140625, "learning_rate": 0.0001, "loss": 4.3303, "loss/crossentropy": 2.161388635635376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23447516560554504, "step": 6014 }, { "epoch": 0.12032, "grad_norm": 2.328125, "grad_norm_var": 1.6235636393229167, "learning_rate": 0.0001, "loss": 4.5112, "loss/crossentropy": 1.9205461740493774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23797179013490677, "step": 6016 }, { "epoch": 0.12036, "grad_norm": 2.3125, "grad_norm_var": 1.616844685872396, "learning_rate": 0.0001, "loss": 4.8019, "loss/crossentropy": 2.025223135948181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25120896100997925, "step": 6018 }, { "epoch": 0.1204, "grad_norm": 2.25, "grad_norm_var": 1.6347005208333334, "learning_rate": 0.0001, "loss": 4.4819, "loss/crossentropy": 1.957942008972168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22228525578975677, "step": 6020 }, { "epoch": 0.12044, "grad_norm": 2.171875, "grad_norm_var": 1.674201456705729, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.3325445652008057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26250994950532913, "step": 6022 }, { "epoch": 0.12048, "grad_norm": 2.03125, "grad_norm_var": 1.6957997639973958, "learning_rate": 0.0001, "loss": 4.264, "loss/crossentropy": 2.0049667954444885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22990728169679642, "step": 6024 }, { "epoch": 0.12052, "grad_norm": 2.265625, "grad_norm_var": 0.08680013020833334, "learning_rate": 0.0001, "loss": 4.5996, "loss/crossentropy": 2.0473387241363525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23628919571638107, "step": 6026 }, { "epoch": 0.12056, "grad_norm": 1.9765625, "grad_norm_var": 0.09107640584309896, "learning_rate": 0.0001, "loss": 4.109, "loss/crossentropy": 2.013141930103302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21728236973285675, "step": 6028 }, { "epoch": 0.1206, "grad_norm": 2.109375, "grad_norm_var": 0.09145278930664062, "learning_rate": 0.0001, "loss": 4.3869, "loss/crossentropy": 2.1269132494926453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25139085948467255, "step": 6030 }, { "epoch": 0.12064, "grad_norm": 2.234375, "grad_norm_var": 0.09058405558268229, "learning_rate": 0.0001, "loss": 4.5568, "loss/crossentropy": 2.5267512798309326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27998340129852295, "step": 6032 }, { "epoch": 0.12068, "grad_norm": 2.78125, "grad_norm_var": 0.10746027628580729, "learning_rate": 0.0001, "loss": 4.2615, "loss/crossentropy": 1.8502249717712402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22863731533288956, "step": 6034 }, { "epoch": 0.12072, "grad_norm": 2.34375, "grad_norm_var": 0.10850601196289063, "learning_rate": 0.0001, "loss": 4.2197, "loss/crossentropy": 1.7754456400871277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23809141665697098, "step": 6036 }, { "epoch": 0.12076, "grad_norm": 2.203125, "grad_norm_var": 0.033760325113932295, "learning_rate": 0.0001, "loss": 4.4714, "loss/crossentropy": 1.9596920609474182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111937776207924, "step": 6038 }, { "epoch": 0.1208, "grad_norm": 2.203125, "grad_norm_var": 0.02995580037434896, "learning_rate": 0.0001, "loss": 4.3079, "loss/crossentropy": 1.888563334941864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21698038280010223, "step": 6040 }, { "epoch": 0.12084, "grad_norm": 2.296875, "grad_norm_var": 0.031404368082682294, "learning_rate": 0.0001, "loss": 4.2412, "loss/crossentropy": 2.1646993160247803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22735021263360977, "step": 6042 }, { "epoch": 0.12088, "grad_norm": 2.1875, "grad_norm_var": 0.0258941650390625, "learning_rate": 0.0001, "loss": 4.525, "loss/crossentropy": 2.0790343284606934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193024456501007, "step": 6044 }, { "epoch": 0.12092, "grad_norm": 2.109375, "grad_norm_var": 0.025211588541666666, "learning_rate": 0.0001, "loss": 4.3538, "loss/crossentropy": 2.2733768224716187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.249516561627388, "step": 6046 }, { "epoch": 0.12096, "grad_norm": 2.21875, "grad_norm_var": 0.02431640625, "learning_rate": 0.0001, "loss": 4.7286, "loss/crossentropy": 2.5003533363342285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22928690910339355, "step": 6048 }, { "epoch": 0.121, "grad_norm": 2.453125, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 4.6269, "loss/crossentropy": 1.9606900215148926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152085080742836, "step": 6050 }, { "epoch": 0.12104, "grad_norm": 2.390625, "grad_norm_var": 0.0384765625, "learning_rate": 0.0001, "loss": 4.5724, "loss/crossentropy": 2.266395926475525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26055608689785004, "step": 6052 }, { "epoch": 0.12108, "grad_norm": 2.296875, "grad_norm_var": 0.037821451822916664, "learning_rate": 0.0001, "loss": 4.5841, "loss/crossentropy": 2.1753041744232178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327822595834732, "step": 6054 }, { "epoch": 0.12112, "grad_norm": 2.15625, "grad_norm_var": 0.041304524739583334, "learning_rate": 0.0001, "loss": 4.2198, "loss/crossentropy": 1.775630235671997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073052078485489, "step": 6056 }, { "epoch": 0.12116, "grad_norm": 2.21875, "grad_norm_var": 0.03975321451822917, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 2.415855050086975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.270721398293972, "step": 6058 }, { "epoch": 0.1212, "grad_norm": 2.34375, "grad_norm_var": 0.039453125, "learning_rate": 0.0001, "loss": 4.6647, "loss/crossentropy": 2.2122162580490112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24723708629608154, "step": 6060 }, { "epoch": 0.12124, "grad_norm": 2.375, "grad_norm_var": 0.041792805989583334, "learning_rate": 0.0001, "loss": 4.3803, "loss/crossentropy": 1.9540830850601196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21385761350393295, "step": 6062 }, { "epoch": 0.12128, "grad_norm": 2.34375, "grad_norm_var": 0.0413726806640625, "learning_rate": 0.0001, "loss": 4.979, "loss/crossentropy": 1.923313319683075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22986505925655365, "step": 6064 }, { "epoch": 0.12132, "grad_norm": 3.03125, "grad_norm_var": 0.05182291666666667, "learning_rate": 0.0001, "loss": 4.6895, "loss/crossentropy": 2.4042444229125977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30044034123420715, "step": 6066 }, { "epoch": 0.12136, "grad_norm": 2.25, "grad_norm_var": 0.049225870768229166, "learning_rate": 0.0001, "loss": 4.5857, "loss/crossentropy": 2.2610549926757812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628016769886017, "step": 6068 }, { "epoch": 0.1214, "grad_norm": 2.140625, "grad_norm_var": 0.0513671875, "learning_rate": 0.0001, "loss": 4.5024, "loss/crossentropy": 1.9100797176361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22663169354200363, "step": 6070 }, { "epoch": 0.12144, "grad_norm": 2.34375, "grad_norm_var": 0.04830729166666667, "learning_rate": 0.0001, "loss": 4.5916, "loss/crossentropy": 2.3971651792526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26831263303756714, "step": 6072 }, { "epoch": 0.12148, "grad_norm": 2.453125, "grad_norm_var": 0.0492095947265625, "learning_rate": 0.0001, "loss": 4.4553, "loss/crossentropy": 2.106821596622467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23786011338233948, "step": 6074 }, { "epoch": 0.12152, "grad_norm": 2.125, "grad_norm_var": 0.052469889322916664, "learning_rate": 0.0001, "loss": 4.4225, "loss/crossentropy": 2.1920535564422607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23812127113342285, "step": 6076 }, { "epoch": 0.12156, "grad_norm": 2.203125, "grad_norm_var": 0.049088541666666666, "learning_rate": 0.0001, "loss": 4.2314, "loss/crossentropy": 2.3014419078826904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2380141019821167, "step": 6078 }, { "epoch": 0.1216, "grad_norm": 2.265625, "grad_norm_var": 0.04903055826822917, "learning_rate": 0.0001, "loss": 4.4558, "loss/crossentropy": 2.1721781492233276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24158813059329987, "step": 6080 }, { "epoch": 0.12164, "grad_norm": 2.21875, "grad_norm_var": 0.013232421875, "learning_rate": 0.0001, "loss": 4.5192, "loss/crossentropy": 2.1230934858322144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23764102160930634, "step": 6082 }, { "epoch": 0.12168, "grad_norm": 2.296875, "grad_norm_var": 0.0166412353515625, "learning_rate": 0.0001, "loss": 4.2147, "loss/crossentropy": 1.9570311307907104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24687693268060684, "step": 6084 }, { "epoch": 0.12172, "grad_norm": 2.15625, "grad_norm_var": 0.014997355143229167, "learning_rate": 0.0001, "loss": 4.2063, "loss/crossentropy": 2.272383213043213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2567671462893486, "step": 6086 }, { "epoch": 0.12176, "grad_norm": 2.1875, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.645, "loss/crossentropy": 2.216492176055908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24377377331256866, "step": 6088 }, { "epoch": 0.1218, "grad_norm": 2.140625, "grad_norm_var": 0.007201131184895833, "learning_rate": 0.0001, "loss": 4.5956, "loss/crossentropy": 2.25177001953125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23176074773073196, "step": 6090 }, { "epoch": 0.12184, "grad_norm": 2.21875, "grad_norm_var": 0.009098307291666666, "learning_rate": 0.0001, "loss": 4.1966, "loss/crossentropy": 2.2452452182769775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24528006464242935, "step": 6092 }, { "epoch": 0.12188, "grad_norm": 2.25, "grad_norm_var": 0.0091949462890625, "learning_rate": 0.0001, "loss": 4.3548, "loss/crossentropy": 2.078445553779602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2276337668299675, "step": 6094 }, { "epoch": 0.12192, "grad_norm": 2.125, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 4.4435, "loss/crossentropy": 2.2938032150268555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2784377336502075, "step": 6096 }, { "epoch": 0.12196, "grad_norm": 2.015625, "grad_norm_var": 0.0136627197265625, "learning_rate": 0.0001, "loss": 4.1588, "loss/crossentropy": 2.028180480003357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23716723918914795, "step": 6098 }, { "epoch": 0.122, "grad_norm": 2.609375, "grad_norm_var": 0.020580037434895834, "learning_rate": 0.0001, "loss": 4.6345, "loss/crossentropy": 2.4959323406219482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2510451450943947, "step": 6100 }, { "epoch": 0.12204, "grad_norm": 2.21875, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 4.6639, "loss/crossentropy": 2.186043620109558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23511512577533722, "step": 6102 }, { "epoch": 0.12208, "grad_norm": 2.359375, "grad_norm_var": 0.024442545572916665, "learning_rate": 0.0001, "loss": 4.6727, "loss/crossentropy": 2.6631078720092773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25689610838890076, "step": 6104 }, { "epoch": 0.12212, "grad_norm": 2.453125, "grad_norm_var": 0.025609334309895832, "learning_rate": 0.0001, "loss": 4.7943, "loss/crossentropy": 2.310486674308777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26937828958034515, "step": 6106 }, { "epoch": 0.12216, "grad_norm": 2.359375, "grad_norm_var": 0.0247711181640625, "learning_rate": 0.0001, "loss": 4.9892, "loss/crossentropy": 2.2036240100860596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24968907237052917, "step": 6108 }, { "epoch": 0.1222, "grad_norm": 2.984375, "grad_norm_var": 0.058251953125, "learning_rate": 0.0001, "loss": 4.3527, "loss/crossentropy": 1.9434874057769775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23247701674699783, "step": 6110 }, { "epoch": 0.12224, "grad_norm": 2.296875, "grad_norm_var": 0.05579020182291667, "learning_rate": 0.0001, "loss": 4.4149, "loss/crossentropy": 2.0525330305099487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24810528755187988, "step": 6112 }, { "epoch": 0.12228, "grad_norm": 2.328125, "grad_norm_var": 0.04482320149739583, "learning_rate": 0.0001, "loss": 4.5205, "loss/crossentropy": 2.0085532665252686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2514277398586273, "step": 6114 }, { "epoch": 0.12232, "grad_norm": 2.328125, "grad_norm_var": 0.04120686848958333, "learning_rate": 0.0001, "loss": 4.5745, "loss/crossentropy": 1.7393967509269714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239851951599121, "step": 6116 }, { "epoch": 0.12236, "grad_norm": 2.546875, "grad_norm_var": 0.04327799479166667, "learning_rate": 0.0001, "loss": 4.7332, "loss/crossentropy": 1.8714343905448914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22823140025138855, "step": 6118 }, { "epoch": 0.1224, "grad_norm": 2.3125, "grad_norm_var": 0.04192708333333333, "learning_rate": 0.0001, "loss": 4.6177, "loss/crossentropy": 1.9353562593460083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22261886298656464, "step": 6120 }, { "epoch": 0.12244, "grad_norm": 2.171875, "grad_norm_var": 0.0469635009765625, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 2.0409420132637024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319282591342926, "step": 6122 }, { "epoch": 0.12248, "grad_norm": 2.125, "grad_norm_var": 0.04882405598958333, "learning_rate": 0.0001, "loss": 4.4764, "loss/crossentropy": 2.2461307048797607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25047267973423004, "step": 6124 }, { "epoch": 0.12252, "grad_norm": 2.203125, "grad_norm_var": 0.014012654622395834, "learning_rate": 0.0001, "loss": 4.4863, "loss/crossentropy": 2.1060246229171753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24702349305152893, "step": 6126 }, { "epoch": 0.12256, "grad_norm": 2.21875, "grad_norm_var": 0.01718724568684896, "learning_rate": 0.0001, "loss": 3.9093, "loss/crossentropy": 1.7481068968772888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19611438363790512, "step": 6128 }, { "epoch": 0.1226, "grad_norm": 2.21875, "grad_norm_var": 0.017329661051432292, "learning_rate": 0.0001, "loss": 4.5506, "loss/crossentropy": 1.8341861963272095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20660528540611267, "step": 6130 }, { "epoch": 0.12264, "grad_norm": 2.421875, "grad_norm_var": 0.018507639567057293, "learning_rate": 0.0001, "loss": 4.3388, "loss/crossentropy": 1.7332024574279785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21615543216466904, "step": 6132 }, { "epoch": 0.12268, "grad_norm": 2.078125, "grad_norm_var": 0.013203684488932292, "learning_rate": 0.0001, "loss": 4.3027, "loss/crossentropy": 1.8959643244743347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219951793551445, "step": 6134 }, { "epoch": 0.12272, "grad_norm": 2.265625, "grad_norm_var": 0.012601470947265625, "learning_rate": 0.0001, "loss": 4.6945, "loss/crossentropy": 2.234723210334778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2342153787612915, "step": 6136 }, { "epoch": 0.12276, "grad_norm": 2.3125, "grad_norm_var": 0.011637115478515625, "learning_rate": 0.0001, "loss": 4.7394, "loss/crossentropy": 2.2297592759132385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24097590148448944, "step": 6138 }, { "epoch": 0.1228, "grad_norm": 2.40625, "grad_norm_var": 0.013586171468098958, "learning_rate": 0.0001, "loss": 4.592, "loss/crossentropy": 2.013442814350128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532622739672661, "step": 6140 }, { "epoch": 0.12284, "grad_norm": 2.15625, "grad_norm_var": 0.014833323160807292, "learning_rate": 0.0001, "loss": 4.417, "loss/crossentropy": 2.3066656589508057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24480555206537247, "step": 6142 }, { "epoch": 0.12288, "grad_norm": 2.4375, "grad_norm_var": 0.014241536458333334, "learning_rate": 0.0001, "loss": 4.4156, "loss/crossentropy": 1.9281310439109802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22363708168268204, "step": 6144 }, { "epoch": 0.12292, "grad_norm": 2.078125, "grad_norm_var": 0.016792805989583333, "learning_rate": 0.0001, "loss": 4.4314, "loss/crossentropy": 1.965875267982483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22153983265161514, "step": 6146 }, { "epoch": 0.12296, "grad_norm": 2.8125, "grad_norm_var": 0.03536783854166667, "learning_rate": 0.0001, "loss": 4.6139, "loss/crossentropy": 1.811126947402954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21779045462608337, "step": 6148 }, { "epoch": 0.123, "grad_norm": 2.34375, "grad_norm_var": 0.0311676025390625, "learning_rate": 0.0001, "loss": 4.4855, "loss/crossentropy": 2.5114762783050537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542252689599991, "step": 6150 }, { "epoch": 0.12304, "grad_norm": 2.296875, "grad_norm_var": 0.028271484375, "learning_rate": 0.0001, "loss": 4.1269, "loss/crossentropy": 1.8784565925598145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24637068808078766, "step": 6152 }, { "epoch": 0.12308, "grad_norm": 2.234375, "grad_norm_var": 0.028685506184895834, "learning_rate": 0.0001, "loss": 4.3377, "loss/crossentropy": 1.6900760531425476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170872688293457, "step": 6154 }, { "epoch": 0.12312, "grad_norm": 2.21875, "grad_norm_var": 0.02867431640625, "learning_rate": 0.0001, "loss": 4.5705, "loss/crossentropy": 2.413783550262451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2624947279691696, "step": 6156 }, { "epoch": 0.12316, "grad_norm": 2.03125, "grad_norm_var": 0.030126953125, "learning_rate": 0.0001, "loss": 4.3463, "loss/crossentropy": 2.0598058104515076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071695551276207, "step": 6158 }, { "epoch": 0.1232, "grad_norm": 2.375, "grad_norm_var": 0.029781087239583334, "learning_rate": 0.0001, "loss": 4.6702, "loss/crossentropy": 2.0407246947288513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24353720247745514, "step": 6160 }, { "epoch": 0.12324, "grad_norm": 2.171875, "grad_norm_var": 0.028450520833333333, "learning_rate": 0.0001, "loss": 4.3116, "loss/crossentropy": 1.9608840346336365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20965111255645752, "step": 6162 }, { "epoch": 0.12328, "grad_norm": 2.046875, "grad_norm_var": 0.01640625, "learning_rate": 0.0001, "loss": 4.2146, "loss/crossentropy": 2.0231454372406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219897098839283, "step": 6164 }, { "epoch": 0.12332, "grad_norm": 2.25, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 4.614, "loss/crossentropy": 2.217663288116455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2466331645846367, "step": 6166 }, { "epoch": 0.12336, "grad_norm": 2.234375, "grad_norm_var": 0.01597900390625, "learning_rate": 0.0001, "loss": 4.4135, "loss/crossentropy": 1.6825732588768005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999632865190506, "step": 6168 }, { "epoch": 0.1234, "grad_norm": 2.5625, "grad_norm_var": 0.020897420247395833, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 2.288950800895691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26286834478378296, "step": 6170 }, { "epoch": 0.12344, "grad_norm": 2.453125, "grad_norm_var": 0.023225911458333335, "learning_rate": 0.0001, "loss": 4.3376, "loss/crossentropy": 1.8202561140060425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22252517193555832, "step": 6172 }, { "epoch": 0.12348, "grad_norm": 2.3125, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 4.5023, "loss/crossentropy": 2.1007159948349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24983422458171844, "step": 6174 }, { "epoch": 0.12352, "grad_norm": 2.171875, "grad_norm_var": 0.0199371337890625, "learning_rate": 0.0001, "loss": 4.4513, "loss/crossentropy": 2.0953084230422974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25545743107795715, "step": 6176 }, { "epoch": 0.12356, "grad_norm": 2.171875, "grad_norm_var": 0.019001261393229166, "learning_rate": 0.0001, "loss": 4.4315, "loss/crossentropy": 2.107246518135071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22741339355707169, "step": 6178 }, { "epoch": 0.1236, "grad_norm": 2.265625, "grad_norm_var": 0.013118489583333334, "learning_rate": 0.0001, "loss": 4.4982, "loss/crossentropy": 2.1219626665115356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22499094158411026, "step": 6180 }, { "epoch": 0.12364, "grad_norm": 2.25, "grad_norm_var": 0.013093058268229167, "learning_rate": 0.0001, "loss": 4.4263, "loss/crossentropy": 1.8892266154289246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255062311887741, "step": 6182 }, { "epoch": 0.12368, "grad_norm": 2.21875, "grad_norm_var": 0.015315755208333334, "learning_rate": 0.0001, "loss": 4.5884, "loss/crossentropy": 2.0765860080718994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24712087213993073, "step": 6184 }, { "epoch": 0.12372, "grad_norm": 2.296875, "grad_norm_var": 0.010074869791666666, "learning_rate": 0.0001, "loss": 4.5431, "loss/crossentropy": 2.301337718963623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24419523775577545, "step": 6186 }, { "epoch": 0.12376, "grad_norm": 2.359375, "grad_norm_var": 0.008373006184895834, "learning_rate": 0.0001, "loss": 4.5043, "loss/crossentropy": 2.0489712953567505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25768278539180756, "step": 6188 }, { "epoch": 0.1238, "grad_norm": 2.140625, "grad_norm_var": 0.0128082275390625, "learning_rate": 0.0001, "loss": 4.2654, "loss/crossentropy": 1.9328826069831848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22916094958782196, "step": 6190 }, { "epoch": 0.12384, "grad_norm": 2.25, "grad_norm_var": 0.01171875, "learning_rate": 0.0001, "loss": 4.1574, "loss/crossentropy": 1.6330446004867554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20600398629903793, "step": 6192 }, { "epoch": 0.12388, "grad_norm": 2.234375, "grad_norm_var": 0.0111724853515625, "learning_rate": 0.0001, "loss": 4.4769, "loss/crossentropy": 1.988203227519989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22711507230997086, "step": 6194 }, { "epoch": 0.12392, "grad_norm": 1.9609375, "grad_norm_var": 0.01590143839518229, "learning_rate": 0.0001, "loss": 4.125, "loss/crossentropy": 2.036049246788025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110549360513687, "step": 6196 }, { "epoch": 0.12396, "grad_norm": 2.265625, "grad_norm_var": 0.01624120076497396, "learning_rate": 0.0001, "loss": 4.5546, "loss/crossentropy": 2.217681884765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23705793917179108, "step": 6198 }, { "epoch": 0.124, "grad_norm": 2.234375, "grad_norm_var": 0.015276845296223958, "learning_rate": 0.0001, "loss": 4.4803, "loss/crossentropy": 2.3878824710845947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24000737071037292, "step": 6200 }, { "epoch": 0.12404, "grad_norm": 2.265625, "grad_norm_var": 0.015852610270182293, "learning_rate": 0.0001, "loss": 4.4754, "loss/crossentropy": 2.2790249586105347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26150786131620407, "step": 6202 }, { "epoch": 0.12408, "grad_norm": 2.1875, "grad_norm_var": 0.015036773681640626, "learning_rate": 0.0001, "loss": 4.4703, "loss/crossentropy": 2.251123785972595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25231797993183136, "step": 6204 }, { "epoch": 0.12412, "grad_norm": 2.171875, "grad_norm_var": 0.010802968343098959, "learning_rate": 0.0001, "loss": 4.5294, "loss/crossentropy": 1.8977670073509216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21783004701137543, "step": 6206 }, { "epoch": 0.12416, "grad_norm": 2.265625, "grad_norm_var": 0.012894439697265624, "learning_rate": 0.0001, "loss": 4.6458, "loss/crossentropy": 2.385319232940674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29223111271858215, "step": 6208 }, { "epoch": 0.1242, "grad_norm": 2.40625, "grad_norm_var": 0.015964508056640625, "learning_rate": 0.0001, "loss": 4.6555, "loss/crossentropy": 1.9274529218673706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22827968001365662, "step": 6210 }, { "epoch": 0.12424, "grad_norm": 2.40625, "grad_norm_var": 0.01226806640625, "learning_rate": 0.0001, "loss": 4.8232, "loss/crossentropy": 2.1861478090286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2424854040145874, "step": 6212 }, { "epoch": 0.12428, "grad_norm": 2.171875, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 4.3002, "loss/crossentropy": 2.2234357595443726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2471691370010376, "step": 6214 }, { "epoch": 0.12432, "grad_norm": 2.625, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 4.5216, "loss/crossentropy": 2.365513563156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2640291824936867, "step": 6216 }, { "epoch": 0.12436, "grad_norm": 2.21875, "grad_norm_var": 0.021955362955729165, "learning_rate": 0.0001, "loss": 4.282, "loss/crossentropy": 1.964316964149475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22341662645339966, "step": 6218 }, { "epoch": 0.1244, "grad_norm": 2.3125, "grad_norm_var": 0.021805826822916666, "learning_rate": 0.0001, "loss": 4.7078, "loss/crossentropy": 2.3704408407211304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24599966406822205, "step": 6220 }, { "epoch": 0.12444, "grad_norm": 2.1875, "grad_norm_var": 0.0193756103515625, "learning_rate": 0.0001, "loss": 4.7558, "loss/crossentropy": 2.1461408138275146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25205330550670624, "step": 6222 }, { "epoch": 0.12448, "grad_norm": 2.21875, "grad_norm_var": 0.018745930989583333, "learning_rate": 0.0001, "loss": 4.7032, "loss/crossentropy": 2.3997104167938232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23371660709381104, "step": 6224 }, { "epoch": 0.12452, "grad_norm": 2.09375, "grad_norm_var": 0.0212890625, "learning_rate": 0.0001, "loss": 4.1842, "loss/crossentropy": 1.7976875305175781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21662414073944092, "step": 6226 }, { "epoch": 0.12456, "grad_norm": 2.140625, "grad_norm_var": 0.02008056640625, "learning_rate": 0.0001, "loss": 4.2214, "loss/crossentropy": 1.9929583668708801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21378497034311295, "step": 6228 }, { "epoch": 0.1246, "grad_norm": 2.21875, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 4.2982, "loss/crossentropy": 1.8853623867034912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21267645806074142, "step": 6230 }, { "epoch": 0.12464, "grad_norm": 2.234375, "grad_norm_var": 0.008454386393229167, "learning_rate": 0.0001, "loss": 4.2638, "loss/crossentropy": 2.2534812688827515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23003943264484406, "step": 6232 }, { "epoch": 0.12468, "grad_norm": 2.0, "grad_norm_var": 0.010863240559895833, "learning_rate": 0.0001, "loss": 4.2504, "loss/crossentropy": 2.026564121246338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172931283712387, "step": 6234 }, { "epoch": 0.12472, "grad_norm": 2.25, "grad_norm_var": 0.01021728515625, "learning_rate": 0.0001, "loss": 4.3555, "loss/crossentropy": 1.9323118925094604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22170638293027878, "step": 6236 }, { "epoch": 0.12476, "grad_norm": 2.296875, "grad_norm_var": 0.007938639322916666, "learning_rate": 0.0001, "loss": 4.479, "loss/crossentropy": 2.056011915206909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2656880244612694, "step": 6238 }, { "epoch": 0.1248, "grad_norm": 2.296875, "grad_norm_var": 0.00830078125, "learning_rate": 0.0001, "loss": 4.6288, "loss/crossentropy": 2.095108926296234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23341026157140732, "step": 6240 }, { "epoch": 0.12484, "grad_norm": 2.4375, "grad_norm_var": 0.011555989583333334, "learning_rate": 0.0001, "loss": 4.737, "loss/crossentropy": 2.0198334455490112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24095112830400467, "step": 6242 }, { "epoch": 0.12488, "grad_norm": 2.09375, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 4.3745, "loss/crossentropy": 2.0397544503211975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22567399591207504, "step": 6244 }, { "epoch": 0.12492, "grad_norm": 2.1875, "grad_norm_var": 0.0120513916015625, "learning_rate": 0.0001, "loss": 4.3633, "loss/crossentropy": 1.9094319343566895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22690637409687042, "step": 6246 }, { "epoch": 0.12496, "grad_norm": 2.265625, "grad_norm_var": 0.011295572916666666, "learning_rate": 0.0001, "loss": 4.6474, "loss/crossentropy": 2.27765429019928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25075703859329224, "step": 6248 }, { "epoch": 0.125, "grad_norm": 2.609375, "grad_norm_var": 0.017023722330729168, "learning_rate": 0.0001, "loss": 4.5769, "loss/crossentropy": 2.0027456283569336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24267998337745667, "step": 6250 }, { "epoch": 0.12504, "grad_norm": 2.359375, "grad_norm_var": 0.017723592122395833, "learning_rate": 0.0001, "loss": 4.5398, "loss/crossentropy": 2.276857614517212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24048637598752975, "step": 6252 }, { "epoch": 0.12508, "grad_norm": 2.390625, "grad_norm_var": 0.018065388997395834, "learning_rate": 0.0001, "loss": 4.7603, "loss/crossentropy": 2.1234214305877686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23481453210115433, "step": 6254 }, { "epoch": 0.12512, "grad_norm": 2.28125, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 4.7537, "loss/crossentropy": 2.201158881187439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23473594337701797, "step": 6256 }, { "epoch": 0.12516, "grad_norm": 2.296875, "grad_norm_var": 0.017220052083333333, "learning_rate": 0.0001, "loss": 4.7202, "loss/crossentropy": 2.3437804579734802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24881581962108612, "step": 6258 }, { "epoch": 0.1252, "grad_norm": 2.265625, "grad_norm_var": 0.015461222330729166, "learning_rate": 0.0001, "loss": 4.5938, "loss/crossentropy": 2.0984586477279663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24332843720912933, "step": 6260 }, { "epoch": 0.12524, "grad_norm": 2.390625, "grad_norm_var": 0.016942342122395832, "learning_rate": 0.0001, "loss": 4.2317, "loss/crossentropy": 1.7946080565452576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21510492265224457, "step": 6262 }, { "epoch": 0.12528, "grad_norm": 2.171875, "grad_norm_var": 0.016161092122395835, "learning_rate": 0.0001, "loss": 4.3889, "loss/crossentropy": 2.022711932659149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22890903800725937, "step": 6264 }, { "epoch": 0.12532, "grad_norm": 2.046875, "grad_norm_var": 0.009577433268229166, "learning_rate": 0.0001, "loss": 4.288, "loss/crossentropy": 1.9752087593078613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21972095221281052, "step": 6266 }, { "epoch": 0.12536, "grad_norm": 2.234375, "grad_norm_var": 0.008625284830729166, "learning_rate": 0.0001, "loss": 4.4952, "loss/crossentropy": 1.7267251014709473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21305006742477417, "step": 6268 }, { "epoch": 0.1254, "grad_norm": 2.109375, "grad_norm_var": 0.008869425455729166, "learning_rate": 0.0001, "loss": 4.2164, "loss/crossentropy": 2.249786615371704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24875187873840332, "step": 6270 }, { "epoch": 0.12544, "grad_norm": 2.46875, "grad_norm_var": 0.012190755208333333, "learning_rate": 0.0001, "loss": 4.693, "loss/crossentropy": 2.402994990348816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25536587834358215, "step": 6272 }, { "epoch": 0.12548, "grad_norm": 2.46875, "grad_norm_var": 0.016243489583333333, "learning_rate": 0.0001, "loss": 4.606, "loss/crossentropy": 2.240913987159729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3013365715742111, "step": 6274 }, { "epoch": 0.12552, "grad_norm": 2.390625, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 4.4691, "loss/crossentropy": 2.0767332911491394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22113215178251266, "step": 6276 }, { "epoch": 0.12556, "grad_norm": 2.140625, "grad_norm_var": 0.02041015625, "learning_rate": 0.0001, "loss": 4.608, "loss/crossentropy": 1.8625048995018005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20321352779865265, "step": 6278 }, { "epoch": 0.1256, "grad_norm": 2.3125, "grad_norm_var": 0.024072265625, "learning_rate": 0.0001, "loss": 4.0386, "loss/crossentropy": 1.9143638610839844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22085773944854736, "step": 6280 }, { "epoch": 0.12564, "grad_norm": 2.34375, "grad_norm_var": 0.023192342122395834, "learning_rate": 0.0001, "loss": 4.372, "loss/crossentropy": 2.3756210803985596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26027245819568634, "step": 6282 }, { "epoch": 0.12568, "grad_norm": 2.21875, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 4.5526, "loss/crossentropy": 1.9310896396636963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086925357580185, "step": 6284 }, { "epoch": 0.12572, "grad_norm": 2.34375, "grad_norm_var": 0.021675618489583333, "learning_rate": 0.0001, "loss": 4.4751, "loss/crossentropy": 2.1757423877716064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.241075336933136, "step": 6286 }, { "epoch": 0.12576, "grad_norm": 2.265625, "grad_norm_var": 0.021284993489583334, "learning_rate": 0.0001, "loss": 4.0946, "loss/crossentropy": 2.3057546615600586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24882248044013977, "step": 6288 }, { "epoch": 0.1258, "grad_norm": 3.140625, "grad_norm_var": 0.06883036295572917, "learning_rate": 0.0001, "loss": 4.5211, "loss/crossentropy": 2.1551633477211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2796258181333542, "step": 6290 }, { "epoch": 0.12584, "grad_norm": 2.296875, "grad_norm_var": 0.06608072916666667, "learning_rate": 0.0001, "loss": 4.4915, "loss/crossentropy": 2.0627459287643433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24781616777181625, "step": 6292 }, { "epoch": 0.12588, "grad_norm": 2.453125, "grad_norm_var": 0.06463216145833334, "learning_rate": 0.0001, "loss": 4.5896, "loss/crossentropy": 1.795321524143219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113223671913147, "step": 6294 }, { "epoch": 0.12592, "grad_norm": 2.171875, "grad_norm_var": 0.06004130045572917, "learning_rate": 0.0001, "loss": 4.4988, "loss/crossentropy": 2.323319673538208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24748852849006653, "step": 6296 }, { "epoch": 0.12596, "grad_norm": 2.296875, "grad_norm_var": 0.05998942057291667, "learning_rate": 0.0001, "loss": 4.4801, "loss/crossentropy": 2.053200662136078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278411090373993, "step": 6298 }, { "epoch": 0.126, "grad_norm": 2.4375, "grad_norm_var": 0.060155232747395836, "learning_rate": 0.0001, "loss": 4.4605, "loss/crossentropy": 2.102539896965027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26071713864803314, "step": 6300 }, { "epoch": 0.12604, "grad_norm": 2.265625, "grad_norm_var": 0.059554036458333334, "learning_rate": 0.0001, "loss": 4.4613, "loss/crossentropy": 1.9316620826721191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23007915914058685, "step": 6302 }, { "epoch": 0.12608, "grad_norm": 2.6875, "grad_norm_var": 0.06288655598958333, "learning_rate": 0.0001, "loss": 4.4194, "loss/crossentropy": 1.9966526627540588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2424854189157486, "step": 6304 }, { "epoch": 0.12612, "grad_norm": 2.046875, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 4.2382, "loss/crossentropy": 1.895868957042694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255951091647148, "step": 6306 }, { "epoch": 0.12616, "grad_norm": 2.171875, "grad_norm_var": 0.023688761393229167, "learning_rate": 0.0001, "loss": 4.7758, "loss/crossentropy": 2.3897345066070557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24461720883846283, "step": 6308 }, { "epoch": 0.1262, "grad_norm": 2.40625, "grad_norm_var": 0.022858683268229166, "learning_rate": 0.0001, "loss": 4.8275, "loss/crossentropy": 2.059292197227478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29264035820961, "step": 6310 }, { "epoch": 0.12624, "grad_norm": 2.390625, "grad_norm_var": 0.0221343994140625, "learning_rate": 0.0001, "loss": 4.5098, "loss/crossentropy": 1.8142406344413757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22127485275268555, "step": 6312 }, { "epoch": 0.12628, "grad_norm": 2.109375, "grad_norm_var": 0.0245025634765625, "learning_rate": 0.0001, "loss": 4.2349, "loss/crossentropy": 2.0186127424240112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217330664396286, "step": 6314 }, { "epoch": 0.12632, "grad_norm": 2.265625, "grad_norm_var": 0.024820963541666668, "learning_rate": 0.0001, "loss": 4.0102, "loss/crossentropy": 1.9439889192581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108919695019722, "step": 6316 }, { "epoch": 0.12636, "grad_norm": 2.28125, "grad_norm_var": 0.024690755208333335, "learning_rate": 0.0001, "loss": 4.8672, "loss/crossentropy": 2.617791175842285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2756097912788391, "step": 6318 }, { "epoch": 0.1264, "grad_norm": 2.515625, "grad_norm_var": 0.017609659830729166, "learning_rate": 0.0001, "loss": 4.3918, "loss/crossentropy": 1.9074286818504333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22491587698459625, "step": 6320 }, { "epoch": 0.12644, "grad_norm": 2.09375, "grad_norm_var": 0.016331990559895832, "learning_rate": 0.0001, "loss": 4.4304, "loss/crossentropy": 2.1414809226989746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22703612595796585, "step": 6322 }, { "epoch": 0.12648, "grad_norm": 2.28125, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 4.4982, "loss/crossentropy": 2.291784942150116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24490328133106232, "step": 6324 }, { "epoch": 0.12652, "grad_norm": 2.1875, "grad_norm_var": 0.0143218994140625, "learning_rate": 0.0001, "loss": 4.228, "loss/crossentropy": 1.938249409198761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22611317038536072, "step": 6326 }, { "epoch": 0.12656, "grad_norm": 2.03125, "grad_norm_var": 0.0187408447265625, "learning_rate": 0.0001, "loss": 4.4932, "loss/crossentropy": 1.7510024905204773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30439992994070053, "step": 6328 }, { "epoch": 0.1266, "grad_norm": 2.46875, "grad_norm_var": 0.022907511393229166, "learning_rate": 0.0001, "loss": 4.4351, "loss/crossentropy": 2.3168221712112427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25660980492830276, "step": 6330 }, { "epoch": 0.12664, "grad_norm": 2.046875, "grad_norm_var": 0.024409993489583334, "learning_rate": 0.0001, "loss": 4.5002, "loss/crossentropy": 1.9957427978515625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21420849114656448, "step": 6332 }, { "epoch": 0.12668, "grad_norm": 2.0625, "grad_norm_var": 0.025104777018229166, "learning_rate": 0.0001, "loss": 4.3583, "loss/crossentropy": 2.4371464252471924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2576342821121216, "step": 6334 }, { "epoch": 0.12672, "grad_norm": 2.171875, "grad_norm_var": 0.021458943684895832, "learning_rate": 0.0001, "loss": 4.2497, "loss/crossentropy": 1.910677433013916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21360614150762558, "step": 6336 }, { "epoch": 0.12676, "grad_norm": 2.203125, "grad_norm_var": 0.020750935872395834, "learning_rate": 0.0001, "loss": 4.2679, "loss/crossentropy": 2.057362914085388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23079442232847214, "step": 6338 }, { "epoch": 0.1268, "grad_norm": 2.21875, "grad_norm_var": 0.0191070556640625, "learning_rate": 0.0001, "loss": 4.1818, "loss/crossentropy": 2.0249438881874084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24038050323724747, "step": 6340 }, { "epoch": 0.12684, "grad_norm": 2.25, "grad_norm_var": 0.019074503580729166, "learning_rate": 0.0001, "loss": 4.5374, "loss/crossentropy": 2.0371533632278442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2355760782957077, "step": 6342 }, { "epoch": 0.12688, "grad_norm": 2.203125, "grad_norm_var": 0.0130523681640625, "learning_rate": 0.0001, "loss": 4.4849, "loss/crossentropy": 2.3101218938827515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24161705374717712, "step": 6344 }, { "epoch": 0.12692, "grad_norm": 2.25, "grad_norm_var": 0.010261027018229167, "learning_rate": 0.0001, "loss": 4.6399, "loss/crossentropy": 2.1990097761154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2537754699587822, "step": 6346 }, { "epoch": 0.12696, "grad_norm": 2.125, "grad_norm_var": 0.0136383056640625, "learning_rate": 0.0001, "loss": 4.2697, "loss/crossentropy": 2.1783597469329834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106732353568077, "step": 6348 }, { "epoch": 0.127, "grad_norm": 2.359375, "grad_norm_var": 0.012987263997395833, "learning_rate": 0.0001, "loss": 4.5127, "loss/crossentropy": 2.2316598892211914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23841773718595505, "step": 6350 }, { "epoch": 0.12704, "grad_norm": 2.265625, "grad_norm_var": 0.00943603515625, "learning_rate": 0.0001, "loss": 4.5651, "loss/crossentropy": 2.0329924821853638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22755083441734314, "step": 6352 }, { "epoch": 0.12708, "grad_norm": 2.078125, "grad_norm_var": 0.014525349934895833, "learning_rate": 0.0001, "loss": 4.1888, "loss/crossentropy": 1.9174052476882935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20000722259283066, "step": 6354 }, { "epoch": 0.12712, "grad_norm": 2.15625, "grad_norm_var": 0.0154693603515625, "learning_rate": 0.0001, "loss": 4.3321, "loss/crossentropy": 1.968774676322937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194407731294632, "step": 6356 }, { "epoch": 0.12716, "grad_norm": 2.296875, "grad_norm_var": 0.0183013916015625, "learning_rate": 0.0001, "loss": 4.5768, "loss/crossentropy": 2.1767213344573975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24483858048915863, "step": 6358 }, { "epoch": 0.1272, "grad_norm": 2.171875, "grad_norm_var": 0.018391927083333332, "learning_rate": 0.0001, "loss": 4.5063, "loss/crossentropy": 1.8390987515449524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21677500754594803, "step": 6360 }, { "epoch": 0.12724, "grad_norm": 2.203125, "grad_norm_var": 0.016039021809895835, "learning_rate": 0.0001, "loss": 4.2635, "loss/crossentropy": 2.1923086643218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21362057328224182, "step": 6362 }, { "epoch": 0.12728, "grad_norm": 2.25, "grad_norm_var": 0.010347493489583333, "learning_rate": 0.0001, "loss": 4.3104, "loss/crossentropy": 2.030683994293213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22884567826986313, "step": 6364 }, { "epoch": 0.12732, "grad_norm": 2.203125, "grad_norm_var": 0.00816650390625, "learning_rate": 0.0001, "loss": 4.452, "loss/crossentropy": 2.330837845802307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24233026057481766, "step": 6366 }, { "epoch": 0.12736, "grad_norm": 2.453125, "grad_norm_var": 0.012040201822916667, "learning_rate": 0.0001, "loss": 4.6404, "loss/crossentropy": 2.001612663269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2332654967904091, "step": 6368 }, { "epoch": 0.1274, "grad_norm": 2.25, "grad_norm_var": 0.011750284830729167, "learning_rate": 0.0001, "loss": 4.7101, "loss/crossentropy": 1.9234941601753235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23741928488016129, "step": 6370 }, { "epoch": 0.12744, "grad_norm": 2.078125, "grad_norm_var": 0.012262980143229166, "learning_rate": 0.0001, "loss": 4.2164, "loss/crossentropy": 2.059934139251709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24736596643924713, "step": 6372 }, { "epoch": 0.12748, "grad_norm": 2.125, "grad_norm_var": 0.010798136393229166, "learning_rate": 0.0001, "loss": 4.3916, "loss/crossentropy": 2.25182843208313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24349220097064972, "step": 6374 }, { "epoch": 0.12752, "grad_norm": 2.15625, "grad_norm_var": 0.010888671875, "learning_rate": 0.0001, "loss": 4.3742, "loss/crossentropy": 1.845405638217926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138313353061676, "step": 6376 }, { "epoch": 0.12756, "grad_norm": 2.546875, "grad_norm_var": 0.0164215087890625, "learning_rate": 0.0001, "loss": 4.568, "loss/crossentropy": 1.8833998441696167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25304871797561646, "step": 6378 }, { "epoch": 0.1276, "grad_norm": 2.28125, "grad_norm_var": 0.017072550455729165, "learning_rate": 0.0001, "loss": 4.2282, "loss/crossentropy": 2.1066314578056335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2231372445821762, "step": 6380 }, { "epoch": 0.12764, "grad_norm": 2.328125, "grad_norm_var": 0.018561808268229167, "learning_rate": 0.0001, "loss": 4.3381, "loss/crossentropy": 2.024670898914337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22260528802871704, "step": 6382 }, { "epoch": 0.12768, "grad_norm": 2.109375, "grad_norm_var": 0.01763916015625, "learning_rate": 0.0001, "loss": 4.8132, "loss/crossentropy": 2.425115466117859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2429298236966133, "step": 6384 }, { "epoch": 0.12772, "grad_norm": 2.109375, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 4.2387, "loss/crossentropy": 2.1847925186157227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495381161570549, "step": 6386 }, { "epoch": 0.12776, "grad_norm": 2.265625, "grad_norm_var": 0.015746053059895834, "learning_rate": 0.0001, "loss": 4.6919, "loss/crossentropy": 2.1463611125946045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2303348332643509, "step": 6388 }, { "epoch": 0.1278, "grad_norm": 2.171875, "grad_norm_var": 0.014839680989583333, "learning_rate": 0.0001, "loss": 4.2714, "loss/crossentropy": 1.9755331873893738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232731431722641, "step": 6390 }, { "epoch": 0.12784, "grad_norm": 2.078125, "grad_norm_var": 0.015973917643229165, "learning_rate": 0.0001, "loss": 4.5255, "loss/crossentropy": 2.0741465091705322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22912710905075073, "step": 6392 }, { "epoch": 0.12788, "grad_norm": 2.203125, "grad_norm_var": 0.009471638997395834, "learning_rate": 0.0001, "loss": 4.3834, "loss/crossentropy": 2.1314439177513123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22910036891698837, "step": 6394 }, { "epoch": 0.12792, "grad_norm": 2.109375, "grad_norm_var": 0.0107086181640625, "learning_rate": 0.0001, "loss": 4.1615, "loss/crossentropy": 1.976862907409668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.233707495033741, "step": 6396 }, { "epoch": 0.12796, "grad_norm": 2.171875, "grad_norm_var": 0.010807291666666666, "learning_rate": 0.0001, "loss": 4.3939, "loss/crossentropy": 2.0534666180610657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21424879133701324, "step": 6398 }, { "epoch": 0.128, "grad_norm": 2.109375, "grad_norm_var": 0.008088175455729167, "learning_rate": 0.0001, "loss": 4.6647, "loss/crossentropy": 2.2200992107391357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2836414724588394, "step": 6400 }, { "epoch": 0.12804, "grad_norm": 2.234375, "grad_norm_var": 0.007542928059895833, "learning_rate": 0.0001, "loss": 4.3802, "loss/crossentropy": 1.8642286658287048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438255906105042, "step": 6402 }, { "epoch": 0.12808, "grad_norm": 2.15625, "grad_norm_var": 0.005464680989583333, "learning_rate": 0.0001, "loss": 4.3868, "loss/crossentropy": 1.9571613073349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22038634121418, "step": 6404 }, { "epoch": 0.12812, "grad_norm": 2.125, "grad_norm_var": 0.008980305989583333, "learning_rate": 0.0001, "loss": 4.6888, "loss/crossentropy": 2.0420787930488586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532814294099808, "step": 6406 }, { "epoch": 0.12816, "grad_norm": 2.109375, "grad_norm_var": 0.008349609375, "learning_rate": 0.0001, "loss": 4.3357, "loss/crossentropy": 1.7511736750602722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067284658551216, "step": 6408 }, { "epoch": 0.1282, "grad_norm": 2.21875, "grad_norm_var": 0.0095123291015625, "learning_rate": 0.0001, "loss": 4.5568, "loss/crossentropy": 2.0310762524604797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2763645648956299, "step": 6410 }, { "epoch": 0.12824, "grad_norm": 2.28125, "grad_norm_var": 0.008617146809895834, "learning_rate": 0.0001, "loss": 4.575, "loss/crossentropy": 2.112699866294861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2409205138683319, "step": 6412 }, { "epoch": 0.12828, "grad_norm": 2.296875, "grad_norm_var": 0.14166259765625, "learning_rate": 0.0001, "loss": 4.3023, "loss/crossentropy": 1.8243904113769531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079915553331375, "step": 6414 }, { "epoch": 0.12832, "grad_norm": 2.125, "grad_norm_var": 0.14143473307291668, "learning_rate": 0.0001, "loss": 4.5395, "loss/crossentropy": 1.8310211896896362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21304991096258163, "step": 6416 }, { "epoch": 0.12836, "grad_norm": 2.140625, "grad_norm_var": 0.14011128743489584, "learning_rate": 0.0001, "loss": 4.3806, "loss/crossentropy": 1.9653990268707275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22207710891962051, "step": 6418 }, { "epoch": 0.1284, "grad_norm": 2.15625, "grad_norm_var": 0.13982645670572916, "learning_rate": 0.0001, "loss": 4.5869, "loss/crossentropy": 2.0583431124687195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23768695443868637, "step": 6420 }, { "epoch": 0.12844, "grad_norm": 2.296875, "grad_norm_var": 0.1391998291015625, "learning_rate": 0.0001, "loss": 4.6512, "loss/crossentropy": 2.1162279844284058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23963302373886108, "step": 6422 }, { "epoch": 0.12848, "grad_norm": 2.28125, "grad_norm_var": 0.13347066243489583, "learning_rate": 0.0001, "loss": 4.6268, "loss/crossentropy": 2.068794012069702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21967273205518723, "step": 6424 }, { "epoch": 0.12852, "grad_norm": 2.109375, "grad_norm_var": 0.13585611979166667, "learning_rate": 0.0001, "loss": 4.4004, "loss/crossentropy": 2.165997266769409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22991405427455902, "step": 6426 }, { "epoch": 0.12856, "grad_norm": 2.25, "grad_norm_var": 0.14031473795572916, "learning_rate": 0.0001, "loss": 4.1816, "loss/crossentropy": 1.9116491675376892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22784583270549774, "step": 6428 }, { "epoch": 0.1286, "grad_norm": 2.296875, "grad_norm_var": 0.0087890625, "learning_rate": 0.0001, "loss": 4.5268, "loss/crossentropy": 2.296347498893738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2618508040904999, "step": 6430 }, { "epoch": 0.12864, "grad_norm": 2.125, "grad_norm_var": 0.0100250244140625, "learning_rate": 0.0001, "loss": 4.0363, "loss/crossentropy": 2.3961373567581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27667778730392456, "step": 6432 }, { "epoch": 0.12868, "grad_norm": 2.171875, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.3993, "loss/crossentropy": 1.9141033291816711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22818633913993835, "step": 6434 }, { "epoch": 0.12872, "grad_norm": 2.15625, "grad_norm_var": 0.0070149739583333336, "learning_rate": 0.0001, "loss": 4.481, "loss/crossentropy": 2.4431718587875366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24852856248617172, "step": 6436 }, { "epoch": 0.12876, "grad_norm": 2.171875, "grad_norm_var": 0.006494140625, "learning_rate": 0.0001, "loss": 4.5153, "loss/crossentropy": 1.9343949556350708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22181283682584763, "step": 6438 }, { "epoch": 0.1288, "grad_norm": 2.171875, "grad_norm_var": 0.005549112955729167, "learning_rate": 0.0001, "loss": 4.4547, "loss/crossentropy": 2.203734040260315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23536919057369232, "step": 6440 }, { "epoch": 0.12884, "grad_norm": 2.046875, "grad_norm_var": 0.0055572509765625, "learning_rate": 0.0001, "loss": 4.354, "loss/crossentropy": 2.0419046878814697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121758982539177, "step": 6442 }, { "epoch": 0.12888, "grad_norm": 2.328125, "grad_norm_var": 0.0054595947265625, "learning_rate": 0.0001, "loss": 4.5738, "loss/crossentropy": 2.3551554679870605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23853591084480286, "step": 6444 }, { "epoch": 0.12892, "grad_norm": 2.21875, "grad_norm_var": 0.004813639322916666, "learning_rate": 0.0001, "loss": 4.4334, "loss/crossentropy": 1.9079387784004211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22546496242284775, "step": 6446 }, { "epoch": 0.12896, "grad_norm": 2.140625, "grad_norm_var": 0.00390625, "learning_rate": 0.0001, "loss": 4.4188, "loss/crossentropy": 1.9344156980514526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21923892199993134, "step": 6448 }, { "epoch": 0.129, "grad_norm": 2.296875, "grad_norm_var": 0.0047271728515625, "learning_rate": 0.0001, "loss": 4.3219, "loss/crossentropy": 1.7627189755439758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20726975798606873, "step": 6450 }, { "epoch": 0.12904, "grad_norm": 2.28125, "grad_norm_var": 0.0048736572265625, "learning_rate": 0.0001, "loss": 4.4045, "loss/crossentropy": 1.9988782405853271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23156572133302689, "step": 6452 }, { "epoch": 0.12908, "grad_norm": 2.1875, "grad_norm_var": 0.004621378580729167, "learning_rate": 0.0001, "loss": 4.6551, "loss/crossentropy": 2.3970296382904053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24723558872938156, "step": 6454 }, { "epoch": 0.12912, "grad_norm": 2.234375, "grad_norm_var": 0.00455322265625, "learning_rate": 0.0001, "loss": 4.3785, "loss/crossentropy": 2.009088099002838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188432812690735, "step": 6456 }, { "epoch": 0.12916, "grad_norm": 2.203125, "grad_norm_var": 0.005882771809895834, "learning_rate": 0.0001, "loss": 4.9189, "loss/crossentropy": 2.166573464870453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22506655752658844, "step": 6458 }, { "epoch": 0.1292, "grad_norm": 2.109375, "grad_norm_var": 0.00670166015625, "learning_rate": 0.0001, "loss": 4.3281, "loss/crossentropy": 2.1567386388778687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23143760859966278, "step": 6460 }, { "epoch": 0.12924, "grad_norm": 2.203125, "grad_norm_var": 0.007307942708333333, "learning_rate": 0.0001, "loss": 3.9726, "loss/crossentropy": 1.8458901643753052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22627578675746918, "step": 6462 }, { "epoch": 0.12928, "grad_norm": 2.21875, "grad_norm_var": 0.00738525390625, "learning_rate": 0.0001, "loss": 4.1416, "loss/crossentropy": 1.7933887243270874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944974958896637, "step": 6464 }, { "epoch": 0.12932, "grad_norm": 2.328125, "grad_norm_var": 0.0077707926432291664, "learning_rate": 0.0001, "loss": 4.3712, "loss/crossentropy": 2.14457631111145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2461908757686615, "step": 6466 }, { "epoch": 0.12936, "grad_norm": 1.9921875, "grad_norm_var": 0.011563873291015625, "learning_rate": 0.0001, "loss": 4.2253, "loss/crossentropy": 2.2191081047058105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24845656007528305, "step": 6468 }, { "epoch": 0.1294, "grad_norm": 2.3125, "grad_norm_var": 0.012284088134765624, "learning_rate": 0.0001, "loss": 4.3992, "loss/crossentropy": 2.2655181884765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24961909651756287, "step": 6470 }, { "epoch": 0.12944, "grad_norm": 2.171875, "grad_norm_var": 0.012617746988932291, "learning_rate": 0.0001, "loss": 4.2431, "loss/crossentropy": 1.9992872476577759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21180924773216248, "step": 6472 }, { "epoch": 0.12948, "grad_norm": 2.078125, "grad_norm_var": 0.009348297119140625, "learning_rate": 0.0001, "loss": 4.2865, "loss/crossentropy": 2.1950103044509888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23401658236980438, "step": 6474 }, { "epoch": 0.12952, "grad_norm": 2.234375, "grad_norm_var": 0.010027821858723958, "learning_rate": 0.0001, "loss": 4.2614, "loss/crossentropy": 2.148743689060211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146785706281662, "step": 6476 }, { "epoch": 0.12956, "grad_norm": 2.140625, "grad_norm_var": 0.010253651936848959, "learning_rate": 0.0001, "loss": 4.7196, "loss/crossentropy": 2.4312193393707275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2596089243888855, "step": 6478 }, { "epoch": 0.1296, "grad_norm": 2.25, "grad_norm_var": 0.011844635009765625, "learning_rate": 0.0001, "loss": 4.7232, "loss/crossentropy": 2.061104893684387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24564718455076218, "step": 6480 }, { "epoch": 0.12964, "grad_norm": 2.21875, "grad_norm_var": 0.011224110921223959, "learning_rate": 0.0001, "loss": 4.6833, "loss/crossentropy": 2.1994687914848328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352369725704193, "step": 6482 }, { "epoch": 0.12968, "grad_norm": 2.203125, "grad_norm_var": 0.010692342122395834, "learning_rate": 0.0001, "loss": 4.2378, "loss/crossentropy": 2.085163116455078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2309635877609253, "step": 6484 }, { "epoch": 0.12972, "grad_norm": 2.1875, "grad_norm_var": 0.0098297119140625, "learning_rate": 0.0001, "loss": 4.5964, "loss/crossentropy": 2.3360198736190796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24846713244915009, "step": 6486 }, { "epoch": 0.12976, "grad_norm": 2.265625, "grad_norm_var": 0.010602823893229167, "learning_rate": 0.0001, "loss": 4.4454, "loss/crossentropy": 2.089003086090088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24980145692825317, "step": 6488 }, { "epoch": 0.1298, "grad_norm": 2.125, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 4.4446, "loss/crossentropy": 2.0884299874305725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23875930905342102, "step": 6490 }, { "epoch": 0.12984, "grad_norm": 2.09375, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.1661, "loss/crossentropy": 1.9070702195167542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21942836046218872, "step": 6492 }, { "epoch": 0.12988, "grad_norm": 2.609375, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 4.6558, "loss/crossentropy": 2.1617711782455444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22820374369621277, "step": 6494 }, { "epoch": 0.12992, "grad_norm": 2.15625, "grad_norm_var": 0.0203765869140625, "learning_rate": 0.0001, "loss": 4.0938, "loss/crossentropy": 1.7782898545265198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22162485867738724, "step": 6496 }, { "epoch": 0.12996, "grad_norm": 2.1875, "grad_norm_var": 0.021028645833333335, "learning_rate": 0.0001, "loss": 4.4198, "loss/crossentropy": 2.1364612579345703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490044683218002, "step": 6498 }, { "epoch": 0.13, "grad_norm": 2.34375, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 4.569, "loss/crossentropy": 2.282773971557617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24553906172513962, "step": 6500 }, { "epoch": 0.13004, "grad_norm": 2.171875, "grad_norm_var": 0.020905558268229166, "learning_rate": 0.0001, "loss": 4.4965, "loss/crossentropy": 2.008001983165741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22152648121118546, "step": 6502 }, { "epoch": 0.13008, "grad_norm": 2.328125, "grad_norm_var": 0.021833292643229165, "learning_rate": 0.0001, "loss": 4.4535, "loss/crossentropy": 2.1681981086730957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26258186995983124, "step": 6504 }, { "epoch": 0.13012, "grad_norm": 2.328125, "grad_norm_var": 0.021761067708333335, "learning_rate": 0.0001, "loss": 4.6254, "loss/crossentropy": 2.5754435062408447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26315446197986603, "step": 6506 }, { "epoch": 0.13016, "grad_norm": 2.234375, "grad_norm_var": 0.021187337239583333, "learning_rate": 0.0001, "loss": 4.1505, "loss/crossentropy": 1.7897658348083496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22884277999401093, "step": 6508 }, { "epoch": 0.1302, "grad_norm": 2.46875, "grad_norm_var": 0.016844685872395834, "learning_rate": 0.0001, "loss": 4.6315, "loss/crossentropy": 1.8081435561180115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21860769391059875, "step": 6510 }, { "epoch": 0.13024, "grad_norm": 2.359375, "grad_norm_var": 0.0140289306640625, "learning_rate": 0.0001, "loss": 4.7371, "loss/crossentropy": 2.243759036064148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545628473162651, "step": 6512 }, { "epoch": 0.13028, "grad_norm": 2.1875, "grad_norm_var": 0.03447265625, "learning_rate": 0.0001, "loss": 4.6147, "loss/crossentropy": 2.069986939430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20862630754709244, "step": 6514 }, { "epoch": 0.13032, "grad_norm": 2.203125, "grad_norm_var": 0.0349761962890625, "learning_rate": 0.0001, "loss": 4.4758, "loss/crossentropy": 2.052341878414154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460884153842926, "step": 6516 }, { "epoch": 0.13036, "grad_norm": 2.390625, "grad_norm_var": 0.0317047119140625, "learning_rate": 0.0001, "loss": 4.6369, "loss/crossentropy": 2.3376708030700684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25382688641548157, "step": 6518 }, { "epoch": 0.1304, "grad_norm": 2.125, "grad_norm_var": 0.0329498291015625, "learning_rate": 0.0001, "loss": 4.3048, "loss/crossentropy": 1.8329171538352966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2135135680437088, "step": 6520 }, { "epoch": 0.13044, "grad_norm": 2.421875, "grad_norm_var": 0.035791015625, "learning_rate": 0.0001, "loss": 4.6582, "loss/crossentropy": 2.077217698097229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265045866370201, "step": 6522 }, { "epoch": 0.13048, "grad_norm": 2.171875, "grad_norm_var": 0.036783854166666664, "learning_rate": 0.0001, "loss": 4.2189, "loss/crossentropy": 2.0677568912506104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2468869537115097, "step": 6524 }, { "epoch": 0.13052, "grad_norm": 2.296875, "grad_norm_var": 0.03591206868489583, "learning_rate": 0.0001, "loss": 4.5246, "loss/crossentropy": 2.0692074298858643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2602302059531212, "step": 6526 }, { "epoch": 0.13056, "grad_norm": 2.421875, "grad_norm_var": 0.03583984375, "learning_rate": 0.0001, "loss": 4.6709, "loss/crossentropy": 1.9767250418663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232941836118698, "step": 6528 }, { "epoch": 0.1306, "grad_norm": 2.234375, "grad_norm_var": 0.01480712890625, "learning_rate": 0.0001, "loss": 4.4851, "loss/crossentropy": 1.7538996934890747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20786649733781815, "step": 6530 }, { "epoch": 0.13064, "grad_norm": 2.203125, "grad_norm_var": 0.01578369140625, "learning_rate": 0.0001, "loss": 4.5598, "loss/crossentropy": 2.061249256134033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23238955438137054, "step": 6532 }, { "epoch": 0.13068, "grad_norm": 2.171875, "grad_norm_var": 0.0167877197265625, "learning_rate": 0.0001, "loss": 4.237, "loss/crossentropy": 1.9083253145217896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22536182403564453, "step": 6534 }, { "epoch": 0.13072, "grad_norm": 2.046875, "grad_norm_var": 0.017952473958333333, "learning_rate": 0.0001, "loss": 4.3004, "loss/crossentropy": 2.052153766155243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2343958392739296, "step": 6536 }, { "epoch": 0.13076, "grad_norm": 2.125, "grad_norm_var": 0.010091145833333334, "learning_rate": 0.0001, "loss": 4.3638, "loss/crossentropy": 1.9979270100593567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20102836191654205, "step": 6538 }, { "epoch": 0.1308, "grad_norm": 1.9921875, "grad_norm_var": 0.012031809488932291, "learning_rate": 0.0001, "loss": 4.019, "loss/crossentropy": 1.618333637714386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21146921813488007, "step": 6540 }, { "epoch": 0.13084, "grad_norm": 2.078125, "grad_norm_var": 0.011940256754557291, "learning_rate": 0.0001, "loss": 4.2514, "loss/crossentropy": 1.771790623664856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21661869436502457, "step": 6542 }, { "epoch": 0.13088, "grad_norm": 2.25, "grad_norm_var": 0.007165273030598958, "learning_rate": 0.0001, "loss": 4.3528, "loss/crossentropy": 2.2347733974456787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2507361173629761, "step": 6544 }, { "epoch": 0.13092, "grad_norm": 2.328125, "grad_norm_var": 0.012389882405598959, "learning_rate": 0.0001, "loss": 4.4333, "loss/crossentropy": 2.226397395133972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.257301464676857, "step": 6546 }, { "epoch": 0.13096, "grad_norm": 2.125, "grad_norm_var": 0.012776438395182292, "learning_rate": 0.0001, "loss": 4.1629, "loss/crossentropy": 1.9884281158447266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2213538959622383, "step": 6548 }, { "epoch": 0.131, "grad_norm": 2.0625, "grad_norm_var": 0.01755549112955729, "learning_rate": 0.0001, "loss": 4.549, "loss/crossentropy": 2.0790398120880127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2259911745786667, "step": 6550 }, { "epoch": 0.13104, "grad_norm": 2.28125, "grad_norm_var": 0.016294097900390624, "learning_rate": 0.0001, "loss": 4.5118, "loss/crossentropy": 2.1309107542037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23984672129154205, "step": 6552 }, { "epoch": 0.13108, "grad_norm": 2.296875, "grad_norm_var": 0.01744562784830729, "learning_rate": 0.0001, "loss": 4.4528, "loss/crossentropy": 1.9651963114738464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21699358522891998, "step": 6554 }, { "epoch": 0.13112, "grad_norm": 2.21875, "grad_norm_var": 0.014598592122395834, "learning_rate": 0.0001, "loss": 4.436, "loss/crossentropy": 2.405247449874878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24624405801296234, "step": 6556 }, { "epoch": 0.13116, "grad_norm": 2.125, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 4.3984, "loss/crossentropy": 2.215611457824707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24835016578435898, "step": 6558 }, { "epoch": 0.1312, "grad_norm": 4.125, "grad_norm_var": 0.2436920166015625, "learning_rate": 0.0001, "loss": 4.3876, "loss/crossentropy": 2.1731653809547424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24633550643920898, "step": 6560 }, { "epoch": 0.13124, "grad_norm": 2.234375, "grad_norm_var": 0.24501953125, "learning_rate": 0.0001, "loss": 4.5101, "loss/crossentropy": 2.0507587790489197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25702129304409027, "step": 6562 }, { "epoch": 0.13128, "grad_norm": 2.375, "grad_norm_var": 0.2433502197265625, "learning_rate": 0.0001, "loss": 4.3464, "loss/crossentropy": 2.2088446617126465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22610870003700256, "step": 6564 }, { "epoch": 0.13132, "grad_norm": 2.34375, "grad_norm_var": 0.23931884765625, "learning_rate": 0.0001, "loss": 4.3612, "loss/crossentropy": 1.6911352276802063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188590243458748, "step": 6566 }, { "epoch": 0.13136, "grad_norm": 2.09375, "grad_norm_var": 0.24976806640625, "learning_rate": 0.0001, "loss": 3.9795, "loss/crossentropy": 1.807699978351593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21844930946826935, "step": 6568 }, { "epoch": 0.1314, "grad_norm": 2.28125, "grad_norm_var": 0.24514872233072918, "learning_rate": 0.0001, "loss": 4.4293, "loss/crossentropy": 2.292602300643921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242776520550251, "step": 6570 }, { "epoch": 0.13144, "grad_norm": 2.03125, "grad_norm_var": 0.24806315104166668, "learning_rate": 0.0001, "loss": 4.07, "loss/crossentropy": 1.5262329578399658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1674257293343544, "step": 6572 }, { "epoch": 0.13148, "grad_norm": 2.15625, "grad_norm_var": 0.24504801432291667, "learning_rate": 0.0001, "loss": 4.4084, "loss/crossentropy": 2.180319309234619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2260192409157753, "step": 6574 }, { "epoch": 0.13152, "grad_norm": 2.265625, "grad_norm_var": 0.0216705322265625, "learning_rate": 0.0001, "loss": 4.544, "loss/crossentropy": 2.188440203666687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23389383405447006, "step": 6576 }, { "epoch": 0.13156, "grad_norm": 2.125, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 4.7595, "loss/crossentropy": 2.20253586769104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3217846751213074, "step": 6578 }, { "epoch": 0.1316, "grad_norm": 2.359375, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 4.2771, "loss/crossentropy": 2.1901479959487915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24521923065185547, "step": 6580 }, { "epoch": 0.13164, "grad_norm": 2.6875, "grad_norm_var": 0.04170633951822917, "learning_rate": 0.0001, "loss": 4.7105, "loss/crossentropy": 2.0719348192214966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2640562355518341, "step": 6582 }, { "epoch": 0.13168, "grad_norm": 2.1875, "grad_norm_var": 0.040339152018229164, "learning_rate": 0.0001, "loss": 4.1487, "loss/crossentropy": 2.331762194633484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24890532344579697, "step": 6584 }, { "epoch": 0.13172, "grad_norm": 2.34375, "grad_norm_var": 0.04429931640625, "learning_rate": 0.0001, "loss": 4.6013, "loss/crossentropy": 1.5143779516220093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1957392692565918, "step": 6586 }, { "epoch": 0.13176, "grad_norm": 2.171875, "grad_norm_var": 0.04088134765625, "learning_rate": 0.0001, "loss": 4.4311, "loss/crossentropy": 2.301910698413849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27353671938180923, "step": 6588 }, { "epoch": 0.1318, "grad_norm": 2.84375, "grad_norm_var": 0.11796773274739583, "learning_rate": 0.0001, "loss": 4.5035, "loss/crossentropy": 2.018579602241516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24261966347694397, "step": 6590 }, { "epoch": 0.13184, "grad_norm": 2.203125, "grad_norm_var": 0.11685282389322917, "learning_rate": 0.0001, "loss": 4.6931, "loss/crossentropy": 1.9749983549118042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20727626234292984, "step": 6592 }, { "epoch": 0.13188, "grad_norm": 2.0, "grad_norm_var": 0.11741536458333333, "learning_rate": 0.0001, "loss": 4.1929, "loss/crossentropy": 2.040414035320282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22879169881343842, "step": 6594 }, { "epoch": 0.13192, "grad_norm": 2.453125, "grad_norm_var": 0.11551106770833333, "learning_rate": 0.0001, "loss": 4.3331, "loss/crossentropy": 1.941766619682312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22591909021139145, "step": 6596 }, { "epoch": 0.13196, "grad_norm": 2.34375, "grad_norm_var": 0.10946858723958333, "learning_rate": 0.0001, "loss": 4.3659, "loss/crossentropy": 1.8487200140953064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23654203116893768, "step": 6598 }, { "epoch": 0.132, "grad_norm": 2.109375, "grad_norm_var": 0.10340067545572916, "learning_rate": 0.0001, "loss": 4.225, "loss/crossentropy": 1.9297338724136353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016594037413597, "step": 6600 }, { "epoch": 0.13204, "grad_norm": 2.046875, "grad_norm_var": 0.11005757649739584, "learning_rate": 0.0001, "loss": 4.3643, "loss/crossentropy": 2.048487663269043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20044995844364166, "step": 6602 }, { "epoch": 0.13208, "grad_norm": 2.0625, "grad_norm_var": 0.11295572916666667, "learning_rate": 0.0001, "loss": 4.425, "loss/crossentropy": 2.3620439767837524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24750277400016785, "step": 6604 }, { "epoch": 0.13212, "grad_norm": 2.328125, "grad_norm_var": 0.0158203125, "learning_rate": 0.0001, "loss": 4.2578, "loss/crossentropy": 2.1324113607406616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23109012842178345, "step": 6606 }, { "epoch": 0.13216, "grad_norm": 2.1875, "grad_norm_var": 0.015620930989583334, "learning_rate": 0.0001, "loss": 4.4775, "loss/crossentropy": 2.021477997303009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152409330010414, "step": 6608 }, { "epoch": 0.1322, "grad_norm": 2.1875, "grad_norm_var": 0.012669881184895834, "learning_rate": 0.0001, "loss": 4.6221, "loss/crossentropy": 2.1693036556243896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234656922519207, "step": 6610 }, { "epoch": 0.13224, "grad_norm": 2.203125, "grad_norm_var": 0.008854166666666666, "learning_rate": 0.0001, "loss": 4.1469, "loss/crossentropy": 2.1138017177581787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239084094762802, "step": 6612 }, { "epoch": 0.13228, "grad_norm": 2.28125, "grad_norm_var": 0.008772786458333333, "learning_rate": 0.0001, "loss": 4.4862, "loss/crossentropy": 2.2540252208709717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2614182382822037, "step": 6614 }, { "epoch": 0.13232, "grad_norm": 2.078125, "grad_norm_var": 0.008837890625, "learning_rate": 0.0001, "loss": 4.2131, "loss/crossentropy": 2.033502757549286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2177339717745781, "step": 6616 }, { "epoch": 0.13236, "grad_norm": 2.078125, "grad_norm_var": 0.010054524739583333, "learning_rate": 0.0001, "loss": 4.2648, "loss/crossentropy": 2.2490646839141846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22369390726089478, "step": 6618 }, { "epoch": 0.1324, "grad_norm": 2.21875, "grad_norm_var": 0.0150543212890625, "learning_rate": 0.0001, "loss": 4.3571, "loss/crossentropy": 2.14319908618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24394190311431885, "step": 6620 }, { "epoch": 0.13244, "grad_norm": 2.390625, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 4.5136, "loss/crossentropy": 2.045474410057068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22738997638225555, "step": 6622 }, { "epoch": 0.13248, "grad_norm": 2.203125, "grad_norm_var": 0.015816243489583333, "learning_rate": 0.0001, "loss": 4.5773, "loss/crossentropy": 2.1853290796279907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2565506473183632, "step": 6624 }, { "epoch": 0.13252, "grad_norm": 2.078125, "grad_norm_var": 0.017015584309895835, "learning_rate": 0.0001, "loss": 4.275, "loss/crossentropy": 2.1161083579063416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22949891537427902, "step": 6626 }, { "epoch": 0.13256, "grad_norm": 2.1875, "grad_norm_var": 0.0167633056640625, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 2.167048454284668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23087909072637558, "step": 6628 }, { "epoch": 0.1326, "grad_norm": 2.21875, "grad_norm_var": 0.0156158447265625, "learning_rate": 0.0001, "loss": 4.2777, "loss/crossentropy": 2.1544610261917114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25153525173664093, "step": 6630 }, { "epoch": 0.13264, "grad_norm": 2.484375, "grad_norm_var": 0.018244425455729168, "learning_rate": 0.0001, "loss": 4.4778, "loss/crossentropy": 2.0319228768348694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22067170590162277, "step": 6632 }, { "epoch": 0.13268, "grad_norm": 2.140625, "grad_norm_var": 0.0162261962890625, "learning_rate": 0.0001, "loss": 4.4398, "loss/crossentropy": 1.9723476767539978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21380966901779175, "step": 6634 }, { "epoch": 0.13272, "grad_norm": 2.796875, "grad_norm_var": 0.031590779622395836, "learning_rate": 0.0001, "loss": 4.3474, "loss/crossentropy": 2.2833333015441895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23245477676391602, "step": 6636 }, { "epoch": 0.13276, "grad_norm": 2.28125, "grad_norm_var": 0.03132222493489583, "learning_rate": 0.0001, "loss": 4.4522, "loss/crossentropy": 2.13793683052063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22665268182754517, "step": 6638 }, { "epoch": 0.1328, "grad_norm": 2.734375, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 4.6386, "loss/crossentropy": 2.188693881034851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2471916377544403, "step": 6640 }, { "epoch": 0.13284, "grad_norm": 2.234375, "grad_norm_var": 0.0438629150390625, "learning_rate": 0.0001, "loss": 4.6394, "loss/crossentropy": 2.169856071472168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23427975177764893, "step": 6642 }, { "epoch": 0.13288, "grad_norm": 2.15625, "grad_norm_var": 0.04365132649739583, "learning_rate": 0.0001, "loss": 4.6498, "loss/crossentropy": 2.1600061655044556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23366540670394897, "step": 6644 }, { "epoch": 0.13292, "grad_norm": 2.359375, "grad_norm_var": 0.044188435872395834, "learning_rate": 0.0001, "loss": 4.4521, "loss/crossentropy": 1.822945475578308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2203340008854866, "step": 6646 }, { "epoch": 0.13296, "grad_norm": 2.15625, "grad_norm_var": 0.045735677083333336, "learning_rate": 0.0001, "loss": 4.3263, "loss/crossentropy": 1.8908653259277344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2320382297039032, "step": 6648 }, { "epoch": 0.133, "grad_norm": 2.265625, "grad_norm_var": 0.04363606770833333, "learning_rate": 0.0001, "loss": 4.5065, "loss/crossentropy": 2.126000165939331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24388836324214935, "step": 6650 }, { "epoch": 0.13304, "grad_norm": 2.328125, "grad_norm_var": 0.0242828369140625, "learning_rate": 0.0001, "loss": 4.5787, "loss/crossentropy": 2.434928297996521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24083568900823593, "step": 6652 }, { "epoch": 0.13308, "grad_norm": 2.203125, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 4.5538, "loss/crossentropy": 2.2186567783355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2220132276415825, "step": 6654 }, { "epoch": 0.13312, "grad_norm": 2.25, "grad_norm_var": 0.006257120768229167, "learning_rate": 0.0001, "loss": 4.4934, "loss/crossentropy": 1.849799931049347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21551364660263062, "step": 6656 }, { "epoch": 0.13316, "grad_norm": 2.109375, "grad_norm_var": 0.0072662353515625, "learning_rate": 0.0001, "loss": 4.2237, "loss/crossentropy": 2.082044243812561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21274320781230927, "step": 6658 }, { "epoch": 0.1332, "grad_norm": 2.125, "grad_norm_var": 0.010838826497395834, "learning_rate": 0.0001, "loss": 4.5884, "loss/crossentropy": 2.1957098245620728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23078418523073196, "step": 6660 }, { "epoch": 0.13324, "grad_norm": 2.28125, "grad_norm_var": 0.16033528645833334, "learning_rate": 0.0001, "loss": 4.519, "loss/crossentropy": 2.228309690952301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27178408950567245, "step": 6662 }, { "epoch": 0.13328, "grad_norm": 2.40625, "grad_norm_var": 0.156298828125, "learning_rate": 0.0001, "loss": 4.5987, "loss/crossentropy": 1.8185940384864807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145048901438713, "step": 6664 }, { "epoch": 0.13332, "grad_norm": 2.171875, "grad_norm_var": 0.1566558837890625, "learning_rate": 0.0001, "loss": 4.4722, "loss/crossentropy": 2.198649048805237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262621968984604, "step": 6666 }, { "epoch": 0.13336, "grad_norm": 2.15625, "grad_norm_var": 0.159130859375, "learning_rate": 0.0001, "loss": 4.5729, "loss/crossentropy": 2.2075835466384888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23531068861484528, "step": 6668 }, { "epoch": 0.1334, "grad_norm": 2.140625, "grad_norm_var": 0.16243387858072916, "learning_rate": 0.0001, "loss": 4.2913, "loss/crossentropy": 1.9719768166542053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206977903842926, "step": 6670 }, { "epoch": 0.13344, "grad_norm": 2.03125, "grad_norm_var": 0.17021484375, "learning_rate": 0.0001, "loss": 4.2144, "loss/crossentropy": 2.304553985595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23418369889259338, "step": 6672 }, { "epoch": 0.13348, "grad_norm": 2.25, "grad_norm_var": 0.1682281494140625, "learning_rate": 0.0001, "loss": 4.4485, "loss/crossentropy": 2.212409734725952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23888318240642548, "step": 6674 }, { "epoch": 0.13352, "grad_norm": 2.0625, "grad_norm_var": 0.19371337890625, "learning_rate": 0.0001, "loss": 4.176, "loss/crossentropy": 2.001897156238556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147163525223732, "step": 6676 }, { "epoch": 0.13356, "grad_norm": 2.203125, "grad_norm_var": 0.043680826822916664, "learning_rate": 0.0001, "loss": 4.4245, "loss/crossentropy": 2.216760039329529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24913202226161957, "step": 6678 }, { "epoch": 0.1336, "grad_norm": 2.09375, "grad_norm_var": 0.04597142537434896, "learning_rate": 0.0001, "loss": 4.1862, "loss/crossentropy": 1.8190750479698181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200186125934124, "step": 6680 }, { "epoch": 0.13364, "grad_norm": 2.140625, "grad_norm_var": 0.047548166910807294, "learning_rate": 0.0001, "loss": 4.7271, "loss/crossentropy": 2.311274528503418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22400956600904465, "step": 6682 }, { "epoch": 0.13368, "grad_norm": 2.34375, "grad_norm_var": 0.5293841044108073, "learning_rate": 0.0001, "loss": 4.3852, "loss/crossentropy": 2.0381893515586853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23101551085710526, "step": 6684 }, { "epoch": 0.13372, "grad_norm": 2.21875, "grad_norm_var": 0.5331776936848959, "learning_rate": 0.0001, "loss": 4.1185, "loss/crossentropy": 1.7441503405570984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20208899676799774, "step": 6686 }, { "epoch": 0.13376, "grad_norm": 2.171875, "grad_norm_var": 0.5252593994140625, "learning_rate": 0.0001, "loss": 4.3835, "loss/crossentropy": 1.8874938488006592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255900353193283, "step": 6688 }, { "epoch": 0.1338, "grad_norm": 2.3125, "grad_norm_var": 0.5284006754557292, "learning_rate": 0.0001, "loss": 4.2563, "loss/crossentropy": 2.2768125534057617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21636968851089478, "step": 6690 }, { "epoch": 0.13384, "grad_norm": 2.09375, "grad_norm_var": 0.5063954671223958, "learning_rate": 0.0001, "loss": 4.278, "loss/crossentropy": 2.0253931283950806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28937554359436035, "step": 6692 }, { "epoch": 0.13388, "grad_norm": 2.28125, "grad_norm_var": 0.5079661051432292, "learning_rate": 0.0001, "loss": 4.4205, "loss/crossentropy": 2.1940718293190002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23384064435958862, "step": 6694 }, { "epoch": 0.13392, "grad_norm": 2.078125, "grad_norm_var": 0.4987993876139323, "learning_rate": 0.0001, "loss": 4.5066, "loss/crossentropy": 2.071534514427185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146512120962143, "step": 6696 }, { "epoch": 0.13396, "grad_norm": 2.28125, "grad_norm_var": 0.5017534891764323, "learning_rate": 0.0001, "loss": 4.348, "loss/crossentropy": 1.8530714511871338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21837462484836578, "step": 6698 }, { "epoch": 0.134, "grad_norm": 2.28125, "grad_norm_var": 0.013396962483723959, "learning_rate": 0.0001, "loss": 4.521, "loss/crossentropy": 2.210664451122284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199932038784027, "step": 6700 }, { "epoch": 0.13404, "grad_norm": 2.171875, "grad_norm_var": 0.011506144205729167, "learning_rate": 0.0001, "loss": 4.4226, "loss/crossentropy": 1.8530223965644836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209881991147995, "step": 6702 }, { "epoch": 0.13408, "grad_norm": 2.234375, "grad_norm_var": 0.011839803059895833, "learning_rate": 0.0001, "loss": 4.4302, "loss/crossentropy": 1.8609183430671692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22096765041351318, "step": 6704 }, { "epoch": 0.13412, "grad_norm": 2.59375, "grad_norm_var": 0.021214803059895832, "learning_rate": 0.0001, "loss": 4.8429, "loss/crossentropy": 2.33315110206604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26414938271045685, "step": 6706 }, { "epoch": 0.13416, "grad_norm": 2.359375, "grad_norm_var": 0.04798075358072917, "learning_rate": 0.0001, "loss": 4.6054, "loss/crossentropy": 2.2656116485595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2769838646054268, "step": 6708 }, { "epoch": 0.1342, "grad_norm": 2.1875, "grad_norm_var": 0.0466705322265625, "learning_rate": 0.0001, "loss": 4.4875, "loss/crossentropy": 2.2131590843200684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23640615493059158, "step": 6710 }, { "epoch": 0.13424, "grad_norm": 2.25, "grad_norm_var": 0.044831339518229166, "learning_rate": 0.0001, "loss": 4.1554, "loss/crossentropy": 1.8667671084403992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212738998234272, "step": 6712 }, { "epoch": 0.13428, "grad_norm": 2.203125, "grad_norm_var": 0.04480692545572917, "learning_rate": 0.0001, "loss": 4.0958, "loss/crossentropy": 1.9699830412864685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23507894575595856, "step": 6714 }, { "epoch": 0.13432, "grad_norm": 2.078125, "grad_norm_var": 0.04632161458333333, "learning_rate": 0.0001, "loss": 4.1571, "loss/crossentropy": 1.8108918070793152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192004919052124, "step": 6716 }, { "epoch": 0.13436, "grad_norm": 2.203125, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 4.4983, "loss/crossentropy": 2.0528674125671387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22221814841032028, "step": 6718 }, { "epoch": 0.1344, "grad_norm": 2.046875, "grad_norm_var": 0.046305338541666664, "learning_rate": 0.0001, "loss": 4.2544, "loss/crossentropy": 1.7881956696510315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21330178529024124, "step": 6720 }, { "epoch": 0.13444, "grad_norm": 2.3125, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 4.5424, "loss/crossentropy": 2.2016018629074097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22042546421289444, "step": 6722 }, { "epoch": 0.13448, "grad_norm": 2.171875, "grad_norm_var": 0.006322224934895833, "learning_rate": 0.0001, "loss": 4.3159, "loss/crossentropy": 1.9661846160888672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21542846411466599, "step": 6724 }, { "epoch": 0.13452, "grad_norm": 2.328125, "grad_norm_var": 0.031037394205729166, "learning_rate": 0.0001, "loss": 4.4473, "loss/crossentropy": 2.1073482036590576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23370730876922607, "step": 6726 }, { "epoch": 0.13456, "grad_norm": 2.25, "grad_norm_var": 0.030826822916666666, "learning_rate": 0.0001, "loss": 4.6069, "loss/crossentropy": 2.1937917470932007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22414422780275345, "step": 6728 }, { "epoch": 0.1346, "grad_norm": 2.109375, "grad_norm_var": 0.030436197916666668, "learning_rate": 0.0001, "loss": 4.2937, "loss/crossentropy": 2.078732967376709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21227504312992096, "step": 6730 }, { "epoch": 0.13464, "grad_norm": 2.34375, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 4.3043, "loss/crossentropy": 1.9002525806427002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21004119515419006, "step": 6732 }, { "epoch": 0.13468, "grad_norm": 2.15625, "grad_norm_var": 0.030794270833333335, "learning_rate": 0.0001, "loss": 4.5156, "loss/crossentropy": 2.055518925189972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21610142290592194, "step": 6734 }, { "epoch": 0.13472, "grad_norm": 2.1875, "grad_norm_var": 0.028473917643229166, "learning_rate": 0.0001, "loss": 4.4889, "loss/crossentropy": 2.0521084666252136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23528365790843964, "step": 6736 }, { "epoch": 0.13476, "grad_norm": 2.203125, "grad_norm_var": 0.02730712890625, "learning_rate": 0.0001, "loss": 4.5051, "loss/crossentropy": 2.2536301612854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21544227004051208, "step": 6738 }, { "epoch": 0.1348, "grad_norm": 2.1875, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 4.5164, "loss/crossentropy": 2.2610143423080444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2532905787229538, "step": 6740 }, { "epoch": 0.13484, "grad_norm": 2.109375, "grad_norm_var": 0.00836181640625, "learning_rate": 0.0001, "loss": 4.1301, "loss/crossentropy": 1.8840081095695496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21514790505170822, "step": 6742 }, { "epoch": 0.13488, "grad_norm": 2.109375, "grad_norm_var": 0.0084136962890625, "learning_rate": 0.0001, "loss": 4.2532, "loss/crossentropy": 2.0841002464294434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2328692153096199, "step": 6744 }, { "epoch": 0.13492, "grad_norm": 2.390625, "grad_norm_var": 0.010798136393229166, "learning_rate": 0.0001, "loss": 4.6335, "loss/crossentropy": 2.507196068763733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2736222892999649, "step": 6746 }, { "epoch": 0.13496, "grad_norm": 2.421875, "grad_norm_var": 0.016527303059895835, "learning_rate": 0.0001, "loss": 4.189, "loss/crossentropy": 2.0180357098579407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22328195720911026, "step": 6748 }, { "epoch": 0.135, "grad_norm": 2.15625, "grad_norm_var": 0.016097005208333334, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.2457560300827026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26572495698928833, "step": 6750 }, { "epoch": 0.13504, "grad_norm": 2.6875, "grad_norm_var": 0.031493123372395834, "learning_rate": 0.0001, "loss": 4.885, "loss/crossentropy": 2.1280853748321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22716474533081055, "step": 6752 }, { "epoch": 0.13508, "grad_norm": 2.25, "grad_norm_var": 0.03216145833333333, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 2.1843650341033936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24630828201770782, "step": 6754 }, { "epoch": 0.13512, "grad_norm": 2.21875, "grad_norm_var": 0.03390299479166667, "learning_rate": 0.0001, "loss": 4.3584, "loss/crossentropy": 1.7955012917518616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614011585712433, "step": 6756 }, { "epoch": 0.13516, "grad_norm": 2.1875, "grad_norm_var": 0.028563435872395834, "learning_rate": 0.0001, "loss": 4.3546, "loss/crossentropy": 1.9315852522850037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22918011993169785, "step": 6758 }, { "epoch": 0.1352, "grad_norm": 2.140625, "grad_norm_var": 0.028055826822916668, "learning_rate": 0.0001, "loss": 4.2659, "loss/crossentropy": 1.982887327671051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21635843813419342, "step": 6760 }, { "epoch": 0.13524, "grad_norm": 2.0625, "grad_norm_var": 0.029215494791666668, "learning_rate": 0.0001, "loss": 4.2828, "loss/crossentropy": 2.25021892786026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22931701689958572, "step": 6762 }, { "epoch": 0.13528, "grad_norm": 2.34375, "grad_norm_var": 0.02252197265625, "learning_rate": 0.0001, "loss": 4.5991, "loss/crossentropy": 2.5220746994018555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2406606376171112, "step": 6764 }, { "epoch": 0.13532, "grad_norm": 2.375, "grad_norm_var": 0.022386678059895835, "learning_rate": 0.0001, "loss": 4.5143, "loss/crossentropy": 1.8115127086639404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220379076898098, "step": 6766 }, { "epoch": 0.13536, "grad_norm": 2.0625, "grad_norm_var": 0.010856119791666667, "learning_rate": 0.0001, "loss": 4.5113, "loss/crossentropy": 1.8998088240623474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22993376106023788, "step": 6768 }, { "epoch": 0.1354, "grad_norm": 2.09375, "grad_norm_var": 0.01051025390625, "learning_rate": 0.0001, "loss": 4.28, "loss/crossentropy": 2.0183660984039307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21991200000047684, "step": 6770 }, { "epoch": 0.13544, "grad_norm": 2.296875, "grad_norm_var": 0.009847005208333334, "learning_rate": 0.0001, "loss": 4.6224, "loss/crossentropy": 2.1927448511123657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536699026823044, "step": 6772 }, { "epoch": 0.13548, "grad_norm": 2.234375, "grad_norm_var": 0.014404296875, "learning_rate": 0.0001, "loss": 4.196, "loss/crossentropy": 1.92184317111969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20716708898544312, "step": 6774 }, { "epoch": 0.13552, "grad_norm": 2.296875, "grad_norm_var": 0.016813151041666665, "learning_rate": 0.0001, "loss": 4.7779, "loss/crossentropy": 2.2437468767166138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26381388306617737, "step": 6776 }, { "epoch": 0.13556, "grad_norm": 2.0625, "grad_norm_var": 0.016722615559895834, "learning_rate": 0.0001, "loss": 4.2907, "loss/crossentropy": 2.087414026260376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21566492319107056, "step": 6778 }, { "epoch": 0.1356, "grad_norm": 2.140625, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 4.4273, "loss/crossentropy": 2.1936367750167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22492723166942596, "step": 6780 }, { "epoch": 0.13564, "grad_norm": 2.109375, "grad_norm_var": 0.014046223958333333, "learning_rate": 0.0001, "loss": 4.2992, "loss/crossentropy": 1.7642306685447693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20992937684059143, "step": 6782 }, { "epoch": 0.13568, "grad_norm": 2.140625, "grad_norm_var": 0.014371744791666667, "learning_rate": 0.0001, "loss": 4.3593, "loss/crossentropy": 2.01781964302063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22552715986967087, "step": 6784 }, { "epoch": 0.13572, "grad_norm": 2.140625, "grad_norm_var": 0.016402180989583334, "learning_rate": 0.0001, "loss": 4.4708, "loss/crossentropy": 2.0788158774375916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24181769788265228, "step": 6786 }, { "epoch": 0.13576, "grad_norm": 2.171875, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 4.2163, "loss/crossentropy": 2.0424017310142517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21705424785614014, "step": 6788 }, { "epoch": 0.1358, "grad_norm": 2.46875, "grad_norm_var": 0.019025675455729165, "learning_rate": 0.0001, "loss": 4.1182, "loss/crossentropy": 1.6175345182418823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1899409219622612, "step": 6790 }, { "epoch": 0.13584, "grad_norm": 2.3125, "grad_norm_var": 0.016852823893229167, "learning_rate": 0.0001, "loss": 4.2914, "loss/crossentropy": 2.004386007785797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22395263612270355, "step": 6792 }, { "epoch": 0.13588, "grad_norm": 2.109375, "grad_norm_var": 0.019245402018229166, "learning_rate": 0.0001, "loss": 4.2182, "loss/crossentropy": 1.9224132895469666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2326791137456894, "step": 6794 }, { "epoch": 0.13592, "grad_norm": 2.1875, "grad_norm_var": 0.019755045572916668, "learning_rate": 0.0001, "loss": 4.4768, "loss/crossentropy": 1.8331453204154968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21586360037326813, "step": 6796 }, { "epoch": 0.13596, "grad_norm": 2.1875, "grad_norm_var": 0.0197906494140625, "learning_rate": 0.0001, "loss": 4.3059, "loss/crossentropy": 2.535244107246399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24744001775979996, "step": 6798 }, { "epoch": 0.136, "grad_norm": 2.25, "grad_norm_var": 0.017704264322916666, "learning_rate": 0.0001, "loss": 4.4444, "loss/crossentropy": 2.0433249473571777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502296343445778, "step": 6800 }, { "epoch": 0.13604, "grad_norm": 2.09375, "grad_norm_var": 0.0221588134765625, "learning_rate": 0.0001, "loss": 4.4619, "loss/crossentropy": 2.35608172416687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536633685231209, "step": 6802 }, { "epoch": 0.13608, "grad_norm": 2.1875, "grad_norm_var": 0.022362263997395833, "learning_rate": 0.0001, "loss": 4.4493, "loss/crossentropy": 2.1230265498161316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2276441976428032, "step": 6804 }, { "epoch": 0.13612, "grad_norm": 2.15625, "grad_norm_var": 0.015925089518229168, "learning_rate": 0.0001, "loss": 4.2125, "loss/crossentropy": 2.0186346769332886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22158686816692352, "step": 6806 }, { "epoch": 0.13616, "grad_norm": 2.28125, "grad_norm_var": 0.0151763916015625, "learning_rate": 0.0001, "loss": 4.6065, "loss/crossentropy": 2.4136343002319336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22668009996414185, "step": 6808 }, { "epoch": 0.1362, "grad_norm": 2.109375, "grad_norm_var": 0.012495930989583333, "learning_rate": 0.0001, "loss": 4.3372, "loss/crossentropy": 2.1241788268089294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23394256830215454, "step": 6810 }, { "epoch": 0.13624, "grad_norm": 2.4375, "grad_norm_var": 0.017145792643229168, "learning_rate": 0.0001, "loss": 4.6063, "loss/crossentropy": 1.9051874279975891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560805454850197, "step": 6812 }, { "epoch": 0.13628, "grad_norm": 2.234375, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 4.2419, "loss/crossentropy": 1.9248363375663757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22457116842269897, "step": 6814 }, { "epoch": 0.13632, "grad_norm": 1.921875, "grad_norm_var": 0.022264607747395835, "learning_rate": 0.0001, "loss": 4.5953, "loss/crossentropy": 2.3065048456192017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.245137557387352, "step": 6816 }, { "epoch": 0.13636, "grad_norm": 2.234375, "grad_norm_var": 0.0152496337890625, "learning_rate": 0.0001, "loss": 4.6096, "loss/crossentropy": 2.152611255645752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22794383764266968, "step": 6818 }, { "epoch": 0.1364, "grad_norm": 2.09375, "grad_norm_var": 0.016242472330729167, "learning_rate": 0.0001, "loss": 4.129, "loss/crossentropy": 1.9548735618591309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20764236897230148, "step": 6820 }, { "epoch": 0.13644, "grad_norm": 2.1875, "grad_norm_var": 0.016136678059895833, "learning_rate": 0.0001, "loss": 4.3501, "loss/crossentropy": 2.11979341506958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22793393582105637, "step": 6822 }, { "epoch": 0.13648, "grad_norm": 2.15625, "grad_norm_var": 0.016748046875, "learning_rate": 0.0001, "loss": 4.3338, "loss/crossentropy": 2.351949691772461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366446554660797, "step": 6824 }, { "epoch": 0.13652, "grad_norm": 2.0625, "grad_norm_var": 0.018843587239583334, "learning_rate": 0.0001, "loss": 4.3764, "loss/crossentropy": 2.1197460889816284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24759536981582642, "step": 6826 }, { "epoch": 0.13656, "grad_norm": 2.0625, "grad_norm_var": 0.013264973958333334, "learning_rate": 0.0001, "loss": 4.1204, "loss/crossentropy": 1.8491687178611755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20777105540037155, "step": 6828 }, { "epoch": 0.1366, "grad_norm": 2.25, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 4.4867, "loss/crossentropy": 1.964136004447937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2368694394826889, "step": 6830 }, { "epoch": 0.13664, "grad_norm": 2.140625, "grad_norm_var": 0.006859334309895834, "learning_rate": 0.0001, "loss": 4.469, "loss/crossentropy": 1.8988104462623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19548720866441727, "step": 6832 }, { "epoch": 0.13668, "grad_norm": 2.125, "grad_norm_var": 0.0059855143229166664, "learning_rate": 0.0001, "loss": 4.3104, "loss/crossentropy": 1.757002353668213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21120281517505646, "step": 6834 }, { "epoch": 0.13672, "grad_norm": 2.109375, "grad_norm_var": 0.005952962239583333, "learning_rate": 0.0001, "loss": 4.4239, "loss/crossentropy": 1.9414420127868652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22160177677869797, "step": 6836 }, { "epoch": 0.13676, "grad_norm": 2.0625, "grad_norm_var": 0.0072174072265625, "learning_rate": 0.0001, "loss": 4.4294, "loss/crossentropy": 2.280028223991394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24696747958660126, "step": 6838 }, { "epoch": 0.1368, "grad_norm": 2.203125, "grad_norm_var": 0.006787109375, "learning_rate": 0.0001, "loss": 4.523, "loss/crossentropy": 2.106986403465271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24022039771080017, "step": 6840 }, { "epoch": 0.13684, "grad_norm": 2.15625, "grad_norm_var": 0.0349517822265625, "learning_rate": 0.0001, "loss": 4.2609, "loss/crossentropy": 1.9540700912475586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968660056591034, "step": 6842 }, { "epoch": 0.13688, "grad_norm": 2.0625, "grad_norm_var": 0.03394775390625, "learning_rate": 0.0001, "loss": 4.2181, "loss/crossentropy": 1.6771780252456665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20230162143707275, "step": 6844 }, { "epoch": 0.13692, "grad_norm": 2.5625, "grad_norm_var": 0.04419657389322917, "learning_rate": 0.0001, "loss": 4.7567, "loss/crossentropy": 2.059873402118683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.276339128613472, "step": 6846 }, { "epoch": 0.13696, "grad_norm": 2.0625, "grad_norm_var": 0.04684244791666667, "learning_rate": 0.0001, "loss": 4.2512, "loss/crossentropy": 1.7943353056907654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893974468111992, "step": 6848 }, { "epoch": 0.137, "grad_norm": 2.265625, "grad_norm_var": 0.046223958333333336, "learning_rate": 0.0001, "loss": 4.687, "loss/crossentropy": 2.314136028289795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24898302555084229, "step": 6850 }, { "epoch": 0.13704, "grad_norm": 2.328125, "grad_norm_var": 0.04383036295572917, "learning_rate": 0.0001, "loss": 4.4257, "loss/crossentropy": 2.0062466263771057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23169535398483276, "step": 6852 }, { "epoch": 0.13708, "grad_norm": 2.09375, "grad_norm_var": 0.044873046875, "learning_rate": 0.0001, "loss": 4.5787, "loss/crossentropy": 2.3600821495056152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25484780967235565, "step": 6854 }, { "epoch": 0.13712, "grad_norm": 2.28125, "grad_norm_var": 0.043675740559895836, "learning_rate": 0.0001, "loss": 4.2113, "loss/crossentropy": 1.885023295879364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.255520723760128, "step": 6856 }, { "epoch": 0.13716, "grad_norm": 2.234375, "grad_norm_var": 0.019806925455729166, "learning_rate": 0.0001, "loss": 4.6493, "loss/crossentropy": 2.2864162921905518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22469403594732285, "step": 6858 }, { "epoch": 0.1372, "grad_norm": 2.09375, "grad_norm_var": 0.018961588541666668, "learning_rate": 0.0001, "loss": 4.4017, "loss/crossentropy": 1.908643126487732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22016740590333939, "step": 6860 }, { "epoch": 0.13724, "grad_norm": 2.3125, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 4.7255, "loss/crossentropy": 2.1028786301612854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23632919788360596, "step": 6862 }, { "epoch": 0.13728, "grad_norm": 2.546875, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 4.4043, "loss/crossentropy": 2.0363592505455017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22131157666444778, "step": 6864 }, { "epoch": 0.13732, "grad_norm": 2.171875, "grad_norm_var": 0.0158843994140625, "learning_rate": 0.0001, "loss": 4.4357, "loss/crossentropy": 2.030495524406433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.233994759619236, "step": 6866 }, { "epoch": 0.13736, "grad_norm": 2.046875, "grad_norm_var": 0.018033854166666665, "learning_rate": 0.0001, "loss": 4.3036, "loss/crossentropy": 1.6365603804588318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909056007862091, "step": 6868 }, { "epoch": 0.1374, "grad_norm": 2.0625, "grad_norm_var": 0.015217081705729166, "learning_rate": 0.0001, "loss": 4.655, "loss/crossentropy": 2.205111026763916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2267644703388214, "step": 6870 }, { "epoch": 0.13744, "grad_norm": 2.0625, "grad_norm_var": 0.016437784830729166, "learning_rate": 0.0001, "loss": 4.4207, "loss/crossentropy": 2.0179646015167236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2307143434882164, "step": 6872 }, { "epoch": 0.13748, "grad_norm": 2.09375, "grad_norm_var": 0.018290201822916668, "learning_rate": 0.0001, "loss": 4.2154, "loss/crossentropy": 1.9697463512420654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207925096154213, "step": 6874 }, { "epoch": 0.13752, "grad_norm": 2.265625, "grad_norm_var": 0.019189453125, "learning_rate": 0.0001, "loss": 4.5691, "loss/crossentropy": 2.186875820159912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25762687623500824, "step": 6876 }, { "epoch": 0.13756, "grad_norm": 2.109375, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 4.2713, "loss/crossentropy": 2.3203837871551514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23470903187990189, "step": 6878 }, { "epoch": 0.1376, "grad_norm": 2.1875, "grad_norm_var": 0.0066640218098958336, "learning_rate": 0.0001, "loss": 4.4154, "loss/crossentropy": 2.2642472982406616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2454553246498108, "step": 6880 }, { "epoch": 0.13764, "grad_norm": 2.1875, "grad_norm_var": 0.008687337239583334, "learning_rate": 0.0001, "loss": 4.4076, "loss/crossentropy": 1.9313859343528748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21797804534435272, "step": 6882 }, { "epoch": 0.13768, "grad_norm": 3.4375, "grad_norm_var": 0.11204020182291667, "learning_rate": 0.0001, "loss": 4.7846, "loss/crossentropy": 2.5469977855682373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2651800066232681, "step": 6884 }, { "epoch": 0.13772, "grad_norm": 2.296875, "grad_norm_var": 0.11030171712239584, "learning_rate": 0.0001, "loss": 4.4893, "loss/crossentropy": 2.549328088760376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2566085457801819, "step": 6886 }, { "epoch": 0.13776, "grad_norm": 2.03125, "grad_norm_var": 0.1126617431640625, "learning_rate": 0.0001, "loss": 4.4927, "loss/crossentropy": 2.3094369769096375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24709751456975937, "step": 6888 }, { "epoch": 0.1378, "grad_norm": 2.109375, "grad_norm_var": 0.1097808837890625, "learning_rate": 0.0001, "loss": 4.2412, "loss/crossentropy": 1.5071046948432922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19776180386543274, "step": 6890 }, { "epoch": 0.13784, "grad_norm": 2.078125, "grad_norm_var": 0.1101226806640625, "learning_rate": 0.0001, "loss": 4.3705, "loss/crossentropy": 2.0064221620559692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20836234837770462, "step": 6892 }, { "epoch": 0.13788, "grad_norm": 2.078125, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 4.3608, "loss/crossentropy": 2.1216301321983337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22151901572942734, "step": 6894 }, { "epoch": 0.13792, "grad_norm": 2.25, "grad_norm_var": 0.1073150634765625, "learning_rate": 0.0001, "loss": 4.4168, "loss/crossentropy": 1.8417679071426392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21999332308769226, "step": 6896 }, { "epoch": 0.13796, "grad_norm": 2.328125, "grad_norm_var": 0.10695699055989584, "learning_rate": 0.0001, "loss": 4.7005, "loss/crossentropy": 2.4651769399642944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27029043436050415, "step": 6898 }, { "epoch": 0.138, "grad_norm": 2.21875, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 4.2985, "loss/crossentropy": 1.7225988507270813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20930497348308563, "step": 6900 }, { "epoch": 0.13804, "grad_norm": 2.34375, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 4.1156, "loss/crossentropy": 2.1218297481536865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21276423335075378, "step": 6902 }, { "epoch": 0.13808, "grad_norm": 2.25, "grad_norm_var": 0.010856119791666667, "learning_rate": 0.0001, "loss": 4.2706, "loss/crossentropy": 2.040019452571869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2438269630074501, "step": 6904 }, { "epoch": 0.13812, "grad_norm": 15.8125, "grad_norm_var": 11.600536092122395, "learning_rate": 0.0001, "loss": 4.5041, "loss/crossentropy": 1.8229625225067139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22197778522968292, "step": 6906 }, { "epoch": 0.13816, "grad_norm": 2.25, "grad_norm_var": 11.543973795572917, "learning_rate": 0.0001, "loss": 4.7087, "loss/crossentropy": 2.453408360481262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24046239256858826, "step": 6908 }, { "epoch": 0.1382, "grad_norm": 2.046875, "grad_norm_var": 11.55152587890625, "learning_rate": 0.0001, "loss": 4.4806, "loss/crossentropy": 2.2724320888519287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23739346861839294, "step": 6910 }, { "epoch": 0.13824, "grad_norm": 2.125, "grad_norm_var": 11.562272135416666, "learning_rate": 0.0001, "loss": 4.2747, "loss/crossentropy": 2.2382686138153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22692064195871353, "step": 6912 }, { "epoch": 0.13828, "grad_norm": 2.3125, "grad_norm_var": 11.54869384765625, "learning_rate": 0.0001, "loss": 4.6867, "loss/crossentropy": 2.021562337875366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24746088683605194, "step": 6914 }, { "epoch": 0.13832, "grad_norm": 2.21875, "grad_norm_var": 11.546556599934895, "learning_rate": 0.0001, "loss": 4.3247, "loss/crossentropy": 2.1071943044662476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144002616405487, "step": 6916 }, { "epoch": 0.13836, "grad_norm": 2.28125, "grad_norm_var": 11.550846354166667, "learning_rate": 0.0001, "loss": 4.2686, "loss/crossentropy": 1.9641517400741577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22584721446037292, "step": 6918 }, { "epoch": 0.1384, "grad_norm": 2.078125, "grad_norm_var": 11.559130859375, "learning_rate": 0.0001, "loss": 4.3194, "loss/crossentropy": 2.4430564641952515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24351171404123306, "step": 6920 }, { "epoch": 0.13844, "grad_norm": 2.125, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 4.4202, "loss/crossentropy": 2.2237725257873535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2248261496424675, "step": 6922 }, { "epoch": 0.13848, "grad_norm": 2.09375, "grad_norm_var": 0.015412394205729167, "learning_rate": 0.0001, "loss": 4.2028, "loss/crossentropy": 1.7291913628578186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19805100560188293, "step": 6924 }, { "epoch": 0.13852, "grad_norm": 2.15625, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.3972, "loss/crossentropy": 1.807108223438263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20386626571416855, "step": 6926 }, { "epoch": 0.13856, "grad_norm": 2.28125, "grad_norm_var": 0.01422119140625, "learning_rate": 0.0001, "loss": 4.5188, "loss/crossentropy": 2.510676622390747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24666880816221237, "step": 6928 }, { "epoch": 0.1386, "grad_norm": 2.015625, "grad_norm_var": 0.007515462239583334, "learning_rate": 0.0001, "loss": 4.2006, "loss/crossentropy": 1.9420115947723389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2214776575565338, "step": 6930 }, { "epoch": 0.13864, "grad_norm": 2.3125, "grad_norm_var": 0.010837554931640625, "learning_rate": 0.0001, "loss": 4.4445, "loss/crossentropy": 2.2288190722465515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23719585686922073, "step": 6932 }, { "epoch": 0.13868, "grad_norm": 2.09375, "grad_norm_var": 0.011043294270833334, "learning_rate": 0.0001, "loss": 4.071, "loss/crossentropy": 2.04274183511734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21653369069099426, "step": 6934 }, { "epoch": 0.13872, "grad_norm": 2.296875, "grad_norm_var": 0.0121002197265625, "learning_rate": 0.0001, "loss": 4.4041, "loss/crossentropy": 1.9149779081344604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19941364973783493, "step": 6936 }, { "epoch": 0.13876, "grad_norm": 2.15625, "grad_norm_var": 0.011554972330729166, "learning_rate": 0.0001, "loss": 4.2577, "loss/crossentropy": 1.7983179092407227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18979590386152267, "step": 6938 }, { "epoch": 0.1388, "grad_norm": 2.296875, "grad_norm_var": 0.023164876302083335, "learning_rate": 0.0001, "loss": 4.3314, "loss/crossentropy": 2.1919915080070496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22652295976877213, "step": 6940 }, { "epoch": 0.13884, "grad_norm": 2.21875, "grad_norm_var": 0.023152669270833332, "learning_rate": 0.0001, "loss": 4.494, "loss/crossentropy": 2.0362821221351624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24296525120735168, "step": 6942 }, { "epoch": 0.13888, "grad_norm": 2.21875, "grad_norm_var": 0.023653157552083335, "learning_rate": 0.0001, "loss": 4.4135, "loss/crossentropy": 2.0371538400650024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2217850610613823, "step": 6944 }, { "epoch": 0.13892, "grad_norm": 2.5, "grad_norm_var": 0.0268310546875, "learning_rate": 0.0001, "loss": 4.3371, "loss/crossentropy": 1.9137988686561584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218162938952446, "step": 6946 }, { "epoch": 0.13896, "grad_norm": 2.46875, "grad_norm_var": 0.03438898722330729, "learning_rate": 0.0001, "loss": 4.6521, "loss/crossentropy": 2.3215843439102173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2576482892036438, "step": 6948 }, { "epoch": 0.139, "grad_norm": 2.171875, "grad_norm_var": 0.026590983072916668, "learning_rate": 0.0001, "loss": 4.6004, "loss/crossentropy": 2.169154405593872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24629274010658264, "step": 6950 }, { "epoch": 0.13904, "grad_norm": 2.203125, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 4.3549, "loss/crossentropy": 2.1355135440826416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316850870847702, "step": 6952 }, { "epoch": 0.13908, "grad_norm": 2.140625, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 4.3462, "loss/crossentropy": 2.264985144138336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23467915505170822, "step": 6954 }, { "epoch": 0.13912, "grad_norm": 2.203125, "grad_norm_var": 0.025926717122395835, "learning_rate": 0.0001, "loss": 4.6559, "loss/crossentropy": 2.1007654666900635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2376948595046997, "step": 6956 }, { "epoch": 0.13916, "grad_norm": 2.28125, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 4.4871, "loss/crossentropy": 2.284608840942383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24811238050460815, "step": 6958 }, { "epoch": 0.1392, "grad_norm": 2.125, "grad_norm_var": 0.025911458333333335, "learning_rate": 0.0001, "loss": 4.2338, "loss/crossentropy": 1.657732367515564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19541934877634048, "step": 6960 }, { "epoch": 0.13924, "grad_norm": 2.171875, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 4.3832, "loss/crossentropy": 1.9607325792312622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2269211858510971, "step": 6962 }, { "epoch": 0.13928, "grad_norm": 2.09375, "grad_norm_var": 0.006571451822916667, "learning_rate": 0.0001, "loss": 4.3759, "loss/crossentropy": 1.7454752326011658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21780573576688766, "step": 6964 }, { "epoch": 0.13932, "grad_norm": 2.421875, "grad_norm_var": 0.010350545247395834, "learning_rate": 0.0001, "loss": 4.7664, "loss/crossentropy": 2.001866638660431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24929454922676086, "step": 6966 }, { "epoch": 0.13936, "grad_norm": 2.21875, "grad_norm_var": 0.008006795247395834, "learning_rate": 0.0001, "loss": 4.4181, "loss/crossentropy": 1.9167855978012085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22114858031272888, "step": 6968 }, { "epoch": 0.1394, "grad_norm": 2.15625, "grad_norm_var": 0.008003743489583333, "learning_rate": 0.0001, "loss": 4.2284, "loss/crossentropy": 2.0324739813804626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23271320760250092, "step": 6970 }, { "epoch": 0.13944, "grad_norm": 2.25, "grad_norm_var": 0.016307576497395834, "learning_rate": 0.0001, "loss": 4.5375, "loss/crossentropy": 2.162013590335846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832238674163818, "step": 6972 }, { "epoch": 0.13948, "grad_norm": 2.203125, "grad_norm_var": 0.018944295247395833, "learning_rate": 0.0001, "loss": 4.1406, "loss/crossentropy": 2.074672818183899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2402098923921585, "step": 6974 }, { "epoch": 0.13952, "grad_norm": 2.0625, "grad_norm_var": 0.019758097330729165, "learning_rate": 0.0001, "loss": 4.4221, "loss/crossentropy": 1.9982299208641052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21126113086938858, "step": 6976 }, { "epoch": 0.13956, "grad_norm": 2.171875, "grad_norm_var": 0.021751912434895833, "learning_rate": 0.0001, "loss": 4.3391, "loss/crossentropy": 1.944049894809723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2296794354915619, "step": 6978 }, { "epoch": 0.1396, "grad_norm": 2.21875, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 4.4344, "loss/crossentropy": 2.308506488800049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2589666247367859, "step": 6980 }, { "epoch": 0.13964, "grad_norm": 2.296875, "grad_norm_var": 0.017724609375, "learning_rate": 0.0001, "loss": 4.2867, "loss/crossentropy": 2.129163682460785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21905581653118134, "step": 6982 }, { "epoch": 0.13968, "grad_norm": 2.234375, "grad_norm_var": 0.017902628580729166, "learning_rate": 0.0001, "loss": 4.3023, "loss/crossentropy": 1.8560669422149658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119361013174057, "step": 6984 }, { "epoch": 0.13972, "grad_norm": 2.140625, "grad_norm_var": 0.018602498372395835, "learning_rate": 0.0001, "loss": 4.1212, "loss/crossentropy": 1.8194095492362976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20872193574905396, "step": 6986 }, { "epoch": 0.13976, "grad_norm": 2.15625, "grad_norm_var": 0.005501302083333334, "learning_rate": 0.0001, "loss": 4.2612, "loss/crossentropy": 2.0200153589248657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20160746574401855, "step": 6988 }, { "epoch": 0.1398, "grad_norm": 2.359375, "grad_norm_var": 0.007298787434895833, "learning_rate": 0.0001, "loss": 4.2757, "loss/crossentropy": 1.982479751110077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23396103084087372, "step": 6990 }, { "epoch": 0.13984, "grad_norm": 2.171875, "grad_norm_var": 0.006494140625, "learning_rate": 0.0001, "loss": 4.5075, "loss/crossentropy": 2.17054283618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23827539384365082, "step": 6992 }, { "epoch": 0.13988, "grad_norm": 2.171875, "grad_norm_var": 0.0056640625, "learning_rate": 0.0001, "loss": 4.1794, "loss/crossentropy": 1.619499921798706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1917721927165985, "step": 6994 }, { "epoch": 0.13992, "grad_norm": 2.15625, "grad_norm_var": 0.005353800455729167, "learning_rate": 0.0001, "loss": 4.3833, "loss/crossentropy": 2.1082500219345093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23768241703510284, "step": 6996 }, { "epoch": 0.13996, "grad_norm": 2.1875, "grad_norm_var": 0.004076131184895833, "learning_rate": 0.0001, "loss": 4.6731, "loss/crossentropy": 1.8480825424194336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21046122163534164, "step": 6998 }, { "epoch": 0.14, "grad_norm": 2.4375, "grad_norm_var": 0.008463541666666666, "learning_rate": 0.0001, "loss": 4.5285, "loss/crossentropy": 2.0547631978988647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22980494797229767, "step": 7000 }, { "epoch": 0.14004, "grad_norm": 2.15625, "grad_norm_var": 0.007835896809895833, "learning_rate": 0.0001, "loss": 4.4625, "loss/crossentropy": 2.0695141553878784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100546732544899, "step": 7002 }, { "epoch": 0.14008, "grad_norm": 1.9921875, "grad_norm_var": 0.012277984619140625, "learning_rate": 0.0001, "loss": 4.3716, "loss/crossentropy": 2.105263113975525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24552470445632935, "step": 7004 }, { "epoch": 0.14012, "grad_norm": 2.375, "grad_norm_var": 0.014427693684895833, "learning_rate": 0.0001, "loss": 4.3566, "loss/crossentropy": 2.03000670671463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24780434370040894, "step": 7006 }, { "epoch": 0.14016, "grad_norm": 2.40625, "grad_norm_var": 0.0173980712890625, "learning_rate": 0.0001, "loss": 4.4758, "loss/crossentropy": 2.288944959640503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2760937511920929, "step": 7008 }, { "epoch": 0.1402, "grad_norm": 1.9609375, "grad_norm_var": 0.06544570922851563, "learning_rate": 0.0001, "loss": 3.9326, "loss/crossentropy": 1.790147304534912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905786320567131, "step": 7010 }, { "epoch": 0.14024, "grad_norm": 2.609375, "grad_norm_var": 0.07765884399414062, "learning_rate": 0.0001, "loss": 4.6364, "loss/crossentropy": 2.008346378803253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23138166218996048, "step": 7012 }, { "epoch": 0.14028, "grad_norm": 2.1875, "grad_norm_var": 0.07974014282226563, "learning_rate": 0.0001, "loss": 4.2304, "loss/crossentropy": 1.9694496393203735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21347863972187042, "step": 7014 }, { "epoch": 0.14032, "grad_norm": 2.203125, "grad_norm_var": 0.07948989868164062, "learning_rate": 0.0001, "loss": 4.3685, "loss/crossentropy": 2.0907286405563354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23349857330322266, "step": 7016 }, { "epoch": 0.14036, "grad_norm": 2.40625, "grad_norm_var": 0.07850316365559896, "learning_rate": 0.0001, "loss": 4.6454, "loss/crossentropy": 2.161414623260498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23841089010238647, "step": 7018 }, { "epoch": 0.1404, "grad_norm": 2.25, "grad_norm_var": 0.07315266927083333, "learning_rate": 0.0001, "loss": 4.4868, "loss/crossentropy": 2.1402887105941772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25493840873241425, "step": 7020 }, { "epoch": 0.14044, "grad_norm": 2.234375, "grad_norm_var": 0.06809666951497396, "learning_rate": 0.0001, "loss": 4.1248, "loss/crossentropy": 2.0703811049461365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24300381541252136, "step": 7022 }, { "epoch": 0.14048, "grad_norm": 2.53125, "grad_norm_var": 0.0716875712076823, "learning_rate": 0.0001, "loss": 4.6165, "loss/crossentropy": 2.152569532394409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23347805440425873, "step": 7024 }, { "epoch": 0.14052, "grad_norm": 2.171875, "grad_norm_var": 0.03453369140625, "learning_rate": 0.0001, "loss": 4.1642, "loss/crossentropy": 2.0831095576286316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2198955938220024, "step": 7026 }, { "epoch": 0.14056, "grad_norm": 2.234375, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 4.4929, "loss/crossentropy": 2.1631508469581604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23971489816904068, "step": 7028 }, { "epoch": 0.1406, "grad_norm": 2.421875, "grad_norm_var": 0.019481404622395834, "learning_rate": 0.0001, "loss": 4.4687, "loss/crossentropy": 2.1683043241500854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2628757208585739, "step": 7030 }, { "epoch": 0.14064, "grad_norm": 2.1875, "grad_norm_var": 0.07377827962239583, "learning_rate": 0.0001, "loss": 4.522, "loss/crossentropy": 2.021001398563385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22922180593013763, "step": 7032 }, { "epoch": 0.14068, "grad_norm": 2.265625, "grad_norm_var": 0.0725494384765625, "learning_rate": 0.0001, "loss": 4.7729, "loss/crossentropy": 2.267430543899536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27525072544813156, "step": 7034 }, { "epoch": 0.14072, "grad_norm": 2.0625, "grad_norm_var": 0.0788726806640625, "learning_rate": 0.0001, "loss": 4.3118, "loss/crossentropy": 2.066729426383972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959475427865982, "step": 7036 }, { "epoch": 0.14076, "grad_norm": 2.0625, "grad_norm_var": 0.08162333170572916, "learning_rate": 0.0001, "loss": 4.3203, "loss/crossentropy": 1.7972697019577026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773608028888702, "step": 7038 }, { "epoch": 0.1408, "grad_norm": 2.171875, "grad_norm_var": 0.08068745930989583, "learning_rate": 0.0001, "loss": 4.0745, "loss/crossentropy": 1.751904845237732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19929176568984985, "step": 7040 }, { "epoch": 0.14084, "grad_norm": 2.171875, "grad_norm_var": 0.07649332682291667, "learning_rate": 0.0001, "loss": 4.4188, "loss/crossentropy": 1.8432873487472534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21235650032758713, "step": 7042 }, { "epoch": 0.14088, "grad_norm": 2.15625, "grad_norm_var": 0.07618815104166667, "learning_rate": 0.0001, "loss": 4.2343, "loss/crossentropy": 1.9589285850524902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21506928652524948, "step": 7044 }, { "epoch": 0.14092, "grad_norm": 2.046875, "grad_norm_var": 0.0788726806640625, "learning_rate": 0.0001, "loss": 4.229, "loss/crossentropy": 2.3658028841018677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23405643552541733, "step": 7046 }, { "epoch": 0.14096, "grad_norm": 2.140625, "grad_norm_var": 0.008837890625, "learning_rate": 0.0001, "loss": 4.3922, "loss/crossentropy": 2.088135540485382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22823208570480347, "step": 7048 }, { "epoch": 0.141, "grad_norm": 2.09375, "grad_norm_var": 0.006810506184895833, "learning_rate": 0.0001, "loss": 4.2367, "loss/crossentropy": 1.9309821724891663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21409911662340164, "step": 7050 }, { "epoch": 0.14104, "grad_norm": 2.1875, "grad_norm_var": 0.006636555989583333, "learning_rate": 0.0001, "loss": 4.5237, "loss/crossentropy": 2.5411492586135864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25017087161540985, "step": 7052 }, { "epoch": 0.14108, "grad_norm": 2.03125, "grad_norm_var": 0.007352701822916667, "learning_rate": 0.0001, "loss": 4.4998, "loss/crossentropy": 2.3210322856903076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23967693746089935, "step": 7054 }, { "epoch": 0.14112, "grad_norm": 2.046875, "grad_norm_var": 0.009370930989583333, "learning_rate": 0.0001, "loss": 4.4607, "loss/crossentropy": 2.054674744606018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23270255327224731, "step": 7056 }, { "epoch": 0.14116, "grad_norm": 2.125, "grad_norm_var": 0.008968098958333334, "learning_rate": 0.0001, "loss": 4.5423, "loss/crossentropy": 2.545789122581482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25560182332992554, "step": 7058 }, { "epoch": 0.1412, "grad_norm": 2.171875, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.2682, "loss/crossentropy": 2.262348175048828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24763159453868866, "step": 7060 }, { "epoch": 0.14124, "grad_norm": 2.1875, "grad_norm_var": 0.008447265625, "learning_rate": 0.0001, "loss": 4.618, "loss/crossentropy": 2.1045475602149963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2472379505634308, "step": 7062 }, { "epoch": 0.14128, "grad_norm": 2.046875, "grad_norm_var": 0.0093170166015625, "learning_rate": 0.0001, "loss": 4.2312, "loss/crossentropy": 1.5632115006446838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19010942429304123, "step": 7064 }, { "epoch": 0.14132, "grad_norm": 2.171875, "grad_norm_var": 0.0116851806640625, "learning_rate": 0.0001, "loss": 4.2638, "loss/crossentropy": 2.0847875475883484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21371345967054367, "step": 7066 }, { "epoch": 0.14136, "grad_norm": 2.25, "grad_norm_var": 0.018016560872395834, "learning_rate": 0.0001, "loss": 4.5171, "loss/crossentropy": 2.2243804931640625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23065787553787231, "step": 7068 }, { "epoch": 0.1414, "grad_norm": 2.171875, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 4.2812, "loss/crossentropy": 1.9477753639221191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21616356819868088, "step": 7070 }, { "epoch": 0.14144, "grad_norm": 2.5625, "grad_norm_var": 0.024169921875, "learning_rate": 0.0001, "loss": 4.7005, "loss/crossentropy": 2.2598072290420532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25795431435108185, "step": 7072 }, { "epoch": 0.14148, "grad_norm": 2.171875, "grad_norm_var": 0.022652180989583333, "learning_rate": 0.0001, "loss": 4.4345, "loss/crossentropy": 1.8817242980003357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21974974125623703, "step": 7074 }, { "epoch": 0.14152, "grad_norm": 2.0, "grad_norm_var": 0.0256744384765625, "learning_rate": 0.0001, "loss": 4.5688, "loss/crossentropy": 2.5275847911834717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24608048796653748, "step": 7076 }, { "epoch": 0.14156, "grad_norm": 2.125, "grad_norm_var": 0.0265045166015625, "learning_rate": 0.0001, "loss": 4.3937, "loss/crossentropy": 2.400865852832794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22960034757852554, "step": 7078 }, { "epoch": 0.1416, "grad_norm": 2.140625, "grad_norm_var": 0.0237457275390625, "learning_rate": 0.0001, "loss": 4.4005, "loss/crossentropy": 1.908901333808899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22271078824996948, "step": 7080 }, { "epoch": 0.14164, "grad_norm": 2.171875, "grad_norm_var": 0.02213134765625, "learning_rate": 0.0001, "loss": 4.1384, "loss/crossentropy": 2.3330780267715454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23944097012281418, "step": 7082 }, { "epoch": 0.14168, "grad_norm": 2.140625, "grad_norm_var": 0.018941243489583332, "learning_rate": 0.0001, "loss": 4.3516, "loss/crossentropy": 2.332213521003723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23576530069112778, "step": 7084 }, { "epoch": 0.14172, "grad_norm": 2.125, "grad_norm_var": 0.022435506184895832, "learning_rate": 0.0001, "loss": 4.4654, "loss/crossentropy": 2.2269067764282227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25341375917196274, "step": 7086 }, { "epoch": 0.14176, "grad_norm": 2.203125, "grad_norm_var": 0.01470947265625, "learning_rate": 0.0001, "loss": 4.2491, "loss/crossentropy": 2.461983561515808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24491792172193527, "step": 7088 }, { "epoch": 0.1418, "grad_norm": 1.9765625, "grad_norm_var": 0.013952382405598958, "learning_rate": 0.0001, "loss": 4.2348, "loss/crossentropy": 2.428719997406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2394469603896141, "step": 7090 }, { "epoch": 0.14184, "grad_norm": 2.109375, "grad_norm_var": 0.011433664957682292, "learning_rate": 0.0001, "loss": 4.2242, "loss/crossentropy": 2.32351291179657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23441501706838608, "step": 7092 }, { "epoch": 0.14188, "grad_norm": 2.171875, "grad_norm_var": 0.011482493082682291, "learning_rate": 0.0001, "loss": 4.4155, "loss/crossentropy": 2.1165764331817627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2350979596376419, "step": 7094 }, { "epoch": 0.14192, "grad_norm": 2.3125, "grad_norm_var": 0.014288075764973958, "learning_rate": 0.0001, "loss": 4.4952, "loss/crossentropy": 2.150681734085083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2475891336798668, "step": 7096 }, { "epoch": 0.14196, "grad_norm": 2.046875, "grad_norm_var": 0.013079579671223958, "learning_rate": 0.0001, "loss": 4.2753, "loss/crossentropy": 2.038177013397217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2293965071439743, "step": 7098 }, { "epoch": 0.142, "grad_norm": 2.0625, "grad_norm_var": 0.014062245686848959, "learning_rate": 0.0001, "loss": 4.298, "loss/crossentropy": 1.899521827697754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20856370776891708, "step": 7100 }, { "epoch": 0.14204, "grad_norm": 2.296875, "grad_norm_var": 0.010593414306640625, "learning_rate": 0.0001, "loss": 4.3449, "loss/crossentropy": 1.9807924032211304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22756796330213547, "step": 7102 }, { "epoch": 0.14208, "grad_norm": 2.0625, "grad_norm_var": 0.009421539306640626, "learning_rate": 0.0001, "loss": 4.2893, "loss/crossentropy": 2.158667206764221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22594892233610153, "step": 7104 }, { "epoch": 0.14212, "grad_norm": 2.15625, "grad_norm_var": 0.007811482747395833, "learning_rate": 0.0001, "loss": 4.3751, "loss/crossentropy": 2.3133270144462585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22925584018230438, "step": 7106 }, { "epoch": 0.14216, "grad_norm": 2.015625, "grad_norm_var": 0.010107421875, "learning_rate": 0.0001, "loss": 4.2762, "loss/crossentropy": 1.9796301126480103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21650104224681854, "step": 7108 }, { "epoch": 0.1422, "grad_norm": 2.09375, "grad_norm_var": 0.010054524739583333, "learning_rate": 0.0001, "loss": 4.2778, "loss/crossentropy": 2.092659056186676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23407263308763504, "step": 7110 }, { "epoch": 0.14224, "grad_norm": 2.125, "grad_norm_var": 0.009056599934895833, "learning_rate": 0.0001, "loss": 4.6078, "loss/crossentropy": 1.970819890499115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262839823961258, "step": 7112 }, { "epoch": 0.14228, "grad_norm": 1.9921875, "grad_norm_var": 0.011923980712890626, "learning_rate": 0.0001, "loss": 4.0642, "loss/crossentropy": 1.5877107381820679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.182430237531662, "step": 7114 }, { "epoch": 0.14232, "grad_norm": 2.21875, "grad_norm_var": 0.020336659749348958, "learning_rate": 0.0001, "loss": 4.6997, "loss/crossentropy": 2.3208755254745483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25018931180238724, "step": 7116 }, { "epoch": 0.14236, "grad_norm": 2.203125, "grad_norm_var": 0.01904271443684896, "learning_rate": 0.0001, "loss": 4.4483, "loss/crossentropy": 2.1348973512649536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23516173660755157, "step": 7118 }, { "epoch": 0.1424, "grad_norm": 2.46875, "grad_norm_var": 0.02533543904622396, "learning_rate": 0.0001, "loss": 4.3443, "loss/crossentropy": 2.1442995071411133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23094037175178528, "step": 7120 }, { "epoch": 0.14244, "grad_norm": 2.1875, "grad_norm_var": 0.025608062744140625, "learning_rate": 0.0001, "loss": 4.3297, "loss/crossentropy": 2.24001145362854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2479296177625656, "step": 7122 }, { "epoch": 0.14248, "grad_norm": 2.734375, "grad_norm_var": 0.038917795817057295, "learning_rate": 0.0001, "loss": 4.4122, "loss/crossentropy": 1.8284733891487122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23670874536037445, "step": 7124 }, { "epoch": 0.14252, "grad_norm": 2.375, "grad_norm_var": 0.04146499633789062, "learning_rate": 0.0001, "loss": 4.6223, "loss/crossentropy": 2.1003533601760864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24235840141773224, "step": 7126 }, { "epoch": 0.14256, "grad_norm": 2.078125, "grad_norm_var": 0.04201024373372396, "learning_rate": 0.0001, "loss": 4.2591, "loss/crossentropy": 2.3661316633224487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.258208692073822, "step": 7128 }, { "epoch": 0.1426, "grad_norm": 2.109375, "grad_norm_var": 0.030989583333333334, "learning_rate": 0.0001, "loss": 4.3288, "loss/crossentropy": 2.2374593019485474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2543400973081589, "step": 7130 }, { "epoch": 0.14264, "grad_norm": 2.15625, "grad_norm_var": 0.02945556640625, "learning_rate": 0.0001, "loss": 4.5939, "loss/crossentropy": 1.9141342639923096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23257911205291748, "step": 7132 }, { "epoch": 0.14268, "grad_norm": 2.4375, "grad_norm_var": 0.031224568684895832, "learning_rate": 0.0001, "loss": 4.1893, "loss/crossentropy": 1.992666780948639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2298717424273491, "step": 7134 }, { "epoch": 0.14272, "grad_norm": 2.03125, "grad_norm_var": 0.033503214518229164, "learning_rate": 0.0001, "loss": 4.2665, "loss/crossentropy": 1.9794283509254456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112378552556038, "step": 7136 }, { "epoch": 0.14276, "grad_norm": 2.5, "grad_norm_var": 0.0382232666015625, "learning_rate": 0.0001, "loss": 4.3801, "loss/crossentropy": 2.1011139154434204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27278490364551544, "step": 7138 }, { "epoch": 0.1428, "grad_norm": 2.234375, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 4.6989, "loss/crossentropy": 2.3489880561828613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24445360153913498, "step": 7140 }, { "epoch": 0.14284, "grad_norm": 2.28125, "grad_norm_var": 0.0173736572265625, "learning_rate": 0.0001, "loss": 4.3418, "loss/crossentropy": 2.011172831058502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210151307284832, "step": 7142 }, { "epoch": 0.14288, "grad_norm": 2.25, "grad_norm_var": 0.017366536458333335, "learning_rate": 0.0001, "loss": 4.3488, "loss/crossentropy": 1.963642418384552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22939135879278183, "step": 7144 }, { "epoch": 0.14292, "grad_norm": 2.21875, "grad_norm_var": 0.016194661458333332, "learning_rate": 0.0001, "loss": 4.5166, "loss/crossentropy": 2.2739341259002686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23235367238521576, "step": 7146 }, { "epoch": 0.14296, "grad_norm": 2.21875, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 4.5977, "loss/crossentropy": 2.282576322555542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24144183099269867, "step": 7148 }, { "epoch": 0.143, "grad_norm": 2.078125, "grad_norm_var": 0.02329279581705729, "learning_rate": 0.0001, "loss": 4.141, "loss/crossentropy": 1.7847901582717896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19184929877519608, "step": 7150 }, { "epoch": 0.14304, "grad_norm": 2.15625, "grad_norm_var": 0.021740468343098958, "learning_rate": 0.0001, "loss": 4.3379, "loss/crossentropy": 2.165170907974243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223430335521698, "step": 7152 }, { "epoch": 0.14308, "grad_norm": 2.15625, "grad_norm_var": 0.018873850504557293, "learning_rate": 0.0001, "loss": 4.3407, "loss/crossentropy": 2.0395787954330444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21662698686122894, "step": 7154 }, { "epoch": 0.14312, "grad_norm": 2.125, "grad_norm_var": 0.01907323201497396, "learning_rate": 0.0001, "loss": 4.4936, "loss/crossentropy": 2.014316141605377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23184175789356232, "step": 7156 }, { "epoch": 0.14316, "grad_norm": 2.296875, "grad_norm_var": 0.019419097900390626, "learning_rate": 0.0001, "loss": 4.5612, "loss/crossentropy": 2.2581117153167725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24055174738168716, "step": 7158 }, { "epoch": 0.1432, "grad_norm": 2.28125, "grad_norm_var": 0.021022288004557292, "learning_rate": 0.0001, "loss": 4.3853, "loss/crossentropy": 2.0905630588531494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2719188630580902, "step": 7160 }, { "epoch": 0.14324, "grad_norm": 2.25, "grad_norm_var": 0.025233713785807292, "learning_rate": 0.0001, "loss": 4.6485, "loss/crossentropy": 2.414529800415039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24188701063394547, "step": 7162 }, { "epoch": 0.14328, "grad_norm": 2.328125, "grad_norm_var": 0.02240778605143229, "learning_rate": 0.0001, "loss": 4.3605, "loss/crossentropy": 2.028432607650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2300800457596779, "step": 7164 }, { "epoch": 0.14332, "grad_norm": 2.515625, "grad_norm_var": 0.03299153645833333, "learning_rate": 0.0001, "loss": 4.2947, "loss/crossentropy": 2.096144199371338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23311930894851685, "step": 7166 }, { "epoch": 0.14336, "grad_norm": 2.25, "grad_norm_var": 0.029622395833333332, "learning_rate": 0.0001, "loss": 4.4375, "loss/crossentropy": 2.259281277656555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25558799505233765, "step": 7168 }, { "epoch": 0.1434, "grad_norm": 2.25, "grad_norm_var": 0.0219390869140625, "learning_rate": 0.0001, "loss": 4.4364, "loss/crossentropy": 2.0766254663467407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22725434601306915, "step": 7170 }, { "epoch": 0.14344, "grad_norm": 2.15625, "grad_norm_var": 0.024312337239583332, "learning_rate": 0.0001, "loss": 4.4695, "loss/crossentropy": 2.26702618598938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22584324330091476, "step": 7172 }, { "epoch": 0.14348, "grad_norm": 2.203125, "grad_norm_var": 0.027534993489583333, "learning_rate": 0.0001, "loss": 4.3417, "loss/crossentropy": 2.1933096647262573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23658160120248795, "step": 7174 }, { "epoch": 0.14352, "grad_norm": 2.1875, "grad_norm_var": 0.027372233072916665, "learning_rate": 0.0001, "loss": 4.351, "loss/crossentropy": 2.2003660202026367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22349942475557327, "step": 7176 }, { "epoch": 0.14356, "grad_norm": 2.28125, "grad_norm_var": 0.025145467122395834, "learning_rate": 0.0001, "loss": 4.684, "loss/crossentropy": 2.4630067348480225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.253003865480423, "step": 7178 }, { "epoch": 0.1436, "grad_norm": 2.109375, "grad_norm_var": 0.027082316080729165, "learning_rate": 0.0001, "loss": 4.6875, "loss/crossentropy": 2.264480948448181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27604997158050537, "step": 7180 }, { "epoch": 0.14364, "grad_norm": 2.203125, "grad_norm_var": 0.00712890625, "learning_rate": 0.0001, "loss": 4.3064, "loss/crossentropy": 2.1641955375671387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22587595880031586, "step": 7182 }, { "epoch": 0.14368, "grad_norm": 2.296875, "grad_norm_var": 0.00943603515625, "learning_rate": 0.0001, "loss": 4.6137, "loss/crossentropy": 2.1432350873947144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24324018508195877, "step": 7184 }, { "epoch": 0.14372, "grad_norm": 1.9140625, "grad_norm_var": 0.014611562093098959, "learning_rate": 0.0001, "loss": 4.3394, "loss/crossentropy": 1.7448238134384155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1751260682940483, "step": 7186 }, { "epoch": 0.14376, "grad_norm": 2.1875, "grad_norm_var": 0.015295155843098958, "learning_rate": 0.0001, "loss": 4.3602, "loss/crossentropy": 2.3202184438705444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2314094603061676, "step": 7188 }, { "epoch": 0.1438, "grad_norm": 1.984375, "grad_norm_var": 0.019681549072265624, "learning_rate": 0.0001, "loss": 4.187, "loss/crossentropy": 1.970094919204712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24998464435338974, "step": 7190 }, { "epoch": 0.14384, "grad_norm": 2.125, "grad_norm_var": 0.02075780232747396, "learning_rate": 0.0001, "loss": 4.215, "loss/crossentropy": 2.1331114768981934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008385732769966, "step": 7192 }, { "epoch": 0.14388, "grad_norm": 2.03125, "grad_norm_var": 0.021345774332682293, "learning_rate": 0.0001, "loss": 4.3976, "loss/crossentropy": 2.1659106016159058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22314336150884628, "step": 7194 }, { "epoch": 0.14392, "grad_norm": 2.1875, "grad_norm_var": 0.018507639567057293, "learning_rate": 0.0001, "loss": 4.6324, "loss/crossentropy": 2.3382883071899414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26393643021583557, "step": 7196 }, { "epoch": 0.14396, "grad_norm": 2.34375, "grad_norm_var": 0.020499420166015626, "learning_rate": 0.0001, "loss": 4.7988, "loss/crossentropy": 2.1325554847717285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2457222416996956, "step": 7198 }, { "epoch": 0.144, "grad_norm": 2.078125, "grad_norm_var": 0.03144709269205729, "learning_rate": 0.0001, "loss": 4.3175, "loss/crossentropy": 1.7927106022834778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20284093916416168, "step": 7200 }, { "epoch": 0.14404, "grad_norm": 2.296875, "grad_norm_var": 0.03050715128580729, "learning_rate": 0.0001, "loss": 4.1676, "loss/crossentropy": 1.9799031615257263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23475942015647888, "step": 7202 }, { "epoch": 0.14408, "grad_norm": 2.234375, "grad_norm_var": 0.02939020792643229, "learning_rate": 0.0001, "loss": 4.2627, "loss/crossentropy": 2.0590370893478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160736471414566, "step": 7204 }, { "epoch": 0.14412, "grad_norm": 2.34375, "grad_norm_var": 0.02800267537434896, "learning_rate": 0.0001, "loss": 4.4397, "loss/crossentropy": 1.9866149425506592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23413674533367157, "step": 7206 }, { "epoch": 0.14416, "grad_norm": 2.5625, "grad_norm_var": 0.03551610310872396, "learning_rate": 0.0001, "loss": 4.526, "loss/crossentropy": 2.1320748925209045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24469739198684692, "step": 7208 }, { "epoch": 0.1442, "grad_norm": 2.375, "grad_norm_var": 0.034708404541015626, "learning_rate": 0.0001, "loss": 4.5834, "loss/crossentropy": 2.225857973098755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23838083446025848, "step": 7210 }, { "epoch": 0.14424, "grad_norm": 2.078125, "grad_norm_var": 0.03794733683268229, "learning_rate": 0.0001, "loss": 3.9952, "loss/crossentropy": 1.9118528962135315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064301297068596, "step": 7212 }, { "epoch": 0.14428, "grad_norm": 2.21875, "grad_norm_var": 0.03806940714518229, "learning_rate": 0.0001, "loss": 4.182, "loss/crossentropy": 1.8142234086990356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172461450099945, "step": 7214 }, { "epoch": 0.14432, "grad_norm": 2.203125, "grad_norm_var": 0.02540868123372396, "learning_rate": 0.0001, "loss": 4.4446, "loss/crossentropy": 1.9308255910873413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2236044555902481, "step": 7216 }, { "epoch": 0.14436, "grad_norm": 2.3125, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 4.4597, "loss/crossentropy": 1.9821211695671082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21110422909259796, "step": 7218 }, { "epoch": 0.1444, "grad_norm": 2.328125, "grad_norm_var": 0.023176066080729165, "learning_rate": 0.0001, "loss": 4.6732, "loss/crossentropy": 2.216045379638672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23433538526296616, "step": 7220 }, { "epoch": 0.14444, "grad_norm": 2.5, "grad_norm_var": 0.024442545572916665, "learning_rate": 0.0001, "loss": 4.6089, "loss/crossentropy": 2.2303662300109863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2390453889966011, "step": 7222 }, { "epoch": 0.14448, "grad_norm": 2.078125, "grad_norm_var": 0.017020670572916667, "learning_rate": 0.0001, "loss": 4.0805, "loss/crossentropy": 2.2152082920074463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22196897864341736, "step": 7224 }, { "epoch": 0.14452, "grad_norm": 2.09375, "grad_norm_var": 0.014860026041666667, "learning_rate": 0.0001, "loss": 4.2843, "loss/crossentropy": 2.134513795375824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964208781719208, "step": 7226 }, { "epoch": 0.14456, "grad_norm": 2.0625, "grad_norm_var": 0.015262858072916666, "learning_rate": 0.0001, "loss": 4.0289, "loss/crossentropy": 1.6803861260414124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18217483162879944, "step": 7228 }, { "epoch": 0.1446, "grad_norm": 2.078125, "grad_norm_var": 0.0152252197265625, "learning_rate": 0.0001, "loss": 4.1353, "loss/crossentropy": 1.6597792506217957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20317095518112183, "step": 7230 }, { "epoch": 0.14464, "grad_norm": 2.015625, "grad_norm_var": 0.017704264322916666, "learning_rate": 0.0001, "loss": 4.2978, "loss/crossentropy": 1.776586651802063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21835225820541382, "step": 7232 }, { "epoch": 0.14468, "grad_norm": 2.03125, "grad_norm_var": 0.017154947916666666, "learning_rate": 0.0001, "loss": 4.1092, "loss/crossentropy": 1.7347259521484375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964067220687866, "step": 7234 }, { "epoch": 0.14472, "grad_norm": 2.046875, "grad_norm_var": 0.015380859375, "learning_rate": 0.0001, "loss": 4.2069, "loss/crossentropy": 1.79097181558609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21776803582906723, "step": 7236 }, { "epoch": 0.14476, "grad_norm": 1.984375, "grad_norm_var": 0.0069244384765625, "learning_rate": 0.0001, "loss": 4.0335, "loss/crossentropy": 2.051329553127289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21669812500476837, "step": 7238 }, { "epoch": 0.1448, "grad_norm": 2.15625, "grad_norm_var": 0.0070220947265625, "learning_rate": 0.0001, "loss": 4.3206, "loss/crossentropy": 1.965324580669403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2283879667520523, "step": 7240 }, { "epoch": 0.14484, "grad_norm": 2.296875, "grad_norm_var": 0.010692342122395834, "learning_rate": 0.0001, "loss": 4.5952, "loss/crossentropy": 2.248784363269806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24906039983034134, "step": 7242 }, { "epoch": 0.14488, "grad_norm": 2.125, "grad_norm_var": 0.010529581705729167, "learning_rate": 0.0001, "loss": 4.3321, "loss/crossentropy": 1.9946890473365784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22583268582820892, "step": 7244 }, { "epoch": 0.14492, "grad_norm": 2.0625, "grad_norm_var": 0.010660807291666666, "learning_rate": 0.0001, "loss": 4.1436, "loss/crossentropy": 2.2306413650512695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22068945318460464, "step": 7246 }, { "epoch": 0.14496, "grad_norm": 2.1875, "grad_norm_var": 0.00758056640625, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 2.110253095626831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22045104205608368, "step": 7248 }, { "epoch": 0.145, "grad_norm": 2.21875, "grad_norm_var": 0.008210245768229167, "learning_rate": 0.0001, "loss": 4.0693, "loss/crossentropy": 1.928157925605774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043258175253868, "step": 7250 }, { "epoch": 0.14504, "grad_norm": 2.0, "grad_norm_var": 0.009110514322916667, "learning_rate": 0.0001, "loss": 4.2458, "loss/crossentropy": 2.2674691677093506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23258862644433975, "step": 7252 }, { "epoch": 0.14508, "grad_norm": 2.765625, "grad_norm_var": 0.03052978515625, "learning_rate": 0.0001, "loss": 4.5107, "loss/crossentropy": 2.2825024127960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.263367660343647, "step": 7254 }, { "epoch": 0.14512, "grad_norm": 2.265625, "grad_norm_var": 0.040185546875, "learning_rate": 0.0001, "loss": 4.2394, "loss/crossentropy": 2.1546168327331543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23073332011699677, "step": 7256 }, { "epoch": 0.14516, "grad_norm": 2.34375, "grad_norm_var": 0.040816243489583334, "learning_rate": 0.0001, "loss": 4.5504, "loss/crossentropy": 2.0490044951438904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24161820113658905, "step": 7258 }, { "epoch": 0.1452, "grad_norm": 2.234375, "grad_norm_var": 0.04038798014322917, "learning_rate": 0.0001, "loss": 4.6468, "loss/crossentropy": 2.115446150302887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2537624090909958, "step": 7260 }, { "epoch": 0.14524, "grad_norm": 2.203125, "grad_norm_var": 0.22388407389322917, "learning_rate": 0.0001, "loss": 4.1457, "loss/crossentropy": 2.0302165746688843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21026766300201416, "step": 7262 }, { "epoch": 0.14528, "grad_norm": 2.171875, "grad_norm_var": 0.2228179931640625, "learning_rate": 0.0001, "loss": 4.4077, "loss/crossentropy": 2.102527379989624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2354147508740425, "step": 7264 }, { "epoch": 0.14532, "grad_norm": 2.171875, "grad_norm_var": 0.21614176432291668, "learning_rate": 0.0001, "loss": 4.1108, "loss/crossentropy": 2.0095282793045044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22679834067821503, "step": 7266 }, { "epoch": 0.14536, "grad_norm": 2.265625, "grad_norm_var": 0.20706278483072918, "learning_rate": 0.0001, "loss": 4.3849, "loss/crossentropy": 1.8988603353500366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21238050609827042, "step": 7268 }, { "epoch": 0.1454, "grad_norm": 2.203125, "grad_norm_var": 0.19975484212239583, "learning_rate": 0.0001, "loss": 4.6614, "loss/crossentropy": 2.186660885810852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24453039467334747, "step": 7270 }, { "epoch": 0.14544, "grad_norm": 2.0625, "grad_norm_var": 0.20244852701822916, "learning_rate": 0.0001, "loss": 4.1409, "loss/crossentropy": 1.8927155137062073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147793024778366, "step": 7272 }, { "epoch": 0.14548, "grad_norm": 2.046875, "grad_norm_var": 0.20608317057291667, "learning_rate": 0.0001, "loss": 4.1375, "loss/crossentropy": 1.8969642519950867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21910040825605392, "step": 7274 }, { "epoch": 0.14552, "grad_norm": 2.21875, "grad_norm_var": 0.20545247395833333, "learning_rate": 0.0001, "loss": 4.3325, "loss/crossentropy": 2.090053617954254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22576116025447845, "step": 7276 }, { "epoch": 0.14556, "grad_norm": 2.171875, "grad_norm_var": 0.010480753580729167, "learning_rate": 0.0001, "loss": 4.7263, "loss/crossentropy": 2.1606650352478027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22937766462564468, "step": 7278 }, { "epoch": 0.1456, "grad_norm": 2.015625, "grad_norm_var": 0.013451131184895833, "learning_rate": 0.0001, "loss": 4.0924, "loss/crossentropy": 1.9946333765983582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20145532488822937, "step": 7280 }, { "epoch": 0.14564, "grad_norm": 2.296875, "grad_norm_var": 0.013834635416666666, "learning_rate": 0.0001, "loss": 4.6519, "loss/crossentropy": 2.0958545207977295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22669509798288345, "step": 7282 }, { "epoch": 0.14568, "grad_norm": 2.328125, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 4.4224, "loss/crossentropy": 2.0515894889831543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2520884945988655, "step": 7284 }, { "epoch": 0.14572, "grad_norm": 2.171875, "grad_norm_var": 0.016063435872395834, "learning_rate": 0.0001, "loss": 4.1572, "loss/crossentropy": 2.034587264060974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21523287147283554, "step": 7286 }, { "epoch": 0.14576, "grad_norm": 2.140625, "grad_norm_var": 0.016080729166666665, "learning_rate": 0.0001, "loss": 3.8649, "loss/crossentropy": 1.6578314900398254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19603776931762695, "step": 7288 }, { "epoch": 0.1458, "grad_norm": 2.109375, "grad_norm_var": 0.014167277018229167, "learning_rate": 0.0001, "loss": 4.2443, "loss/crossentropy": 2.0019100308418274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22880420833826065, "step": 7290 }, { "epoch": 0.14584, "grad_norm": 2.046875, "grad_norm_var": 0.018723297119140624, "learning_rate": 0.0001, "loss": 3.981, "loss/crossentropy": 2.068517565727234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19675085693597794, "step": 7292 }, { "epoch": 0.14588, "grad_norm": 2.203125, "grad_norm_var": 0.013734690348307292, "learning_rate": 0.0001, "loss": 4.4037, "loss/crossentropy": 2.000797212123871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20464950054883957, "step": 7294 }, { "epoch": 0.14592, "grad_norm": 2.125, "grad_norm_var": 0.013734690348307292, "learning_rate": 0.0001, "loss": 4.4132, "loss/crossentropy": 2.1914668679237366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2406519278883934, "step": 7296 }, { "epoch": 0.14596, "grad_norm": 2.390625, "grad_norm_var": 0.01685358683268229, "learning_rate": 0.0001, "loss": 4.4737, "loss/crossentropy": 2.123211979866028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.250032439827919, "step": 7298 }, { "epoch": 0.146, "grad_norm": 2.625, "grad_norm_var": 0.028507232666015625, "learning_rate": 0.0001, "loss": 4.718, "loss/crossentropy": 2.0686148405075073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22581970691680908, "step": 7300 }, { "epoch": 0.14604, "grad_norm": 2.25, "grad_norm_var": 0.029288482666015626, "learning_rate": 0.0001, "loss": 4.2295, "loss/crossentropy": 2.141907751560211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21503636240959167, "step": 7302 }, { "epoch": 0.14608, "grad_norm": 2.09375, "grad_norm_var": 0.028436024983723957, "learning_rate": 0.0001, "loss": 4.1493, "loss/crossentropy": 1.6741206645965576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20490698516368866, "step": 7304 }, { "epoch": 0.14612, "grad_norm": 2.09375, "grad_norm_var": 0.027854156494140626, "learning_rate": 0.0001, "loss": 4.1259, "loss/crossentropy": 1.8094561696052551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21328043192625046, "step": 7306 }, { "epoch": 0.14616, "grad_norm": 2.109375, "grad_norm_var": 0.020970662434895832, "learning_rate": 0.0001, "loss": 4.3715, "loss/crossentropy": 2.204083800315857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24204879999160767, "step": 7308 }, { "epoch": 0.1462, "grad_norm": 2.234375, "grad_norm_var": 0.021126302083333333, "learning_rate": 0.0001, "loss": 4.2614, "loss/crossentropy": 2.0166819095611572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2219880372285843, "step": 7310 }, { "epoch": 0.14624, "grad_norm": 2.25, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 4.3768, "loss/crossentropy": 2.4667757749557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2880199924111366, "step": 7312 }, { "epoch": 0.14628, "grad_norm": 2.1875, "grad_norm_var": 0.019287109375, "learning_rate": 0.0001, "loss": 4.2005, "loss/crossentropy": 2.1497310400009155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21792854368686676, "step": 7314 }, { "epoch": 0.14632, "grad_norm": 2.25, "grad_norm_var": 0.0062896728515625, "learning_rate": 0.0001, "loss": 4.3911, "loss/crossentropy": 2.179584264755249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23200812935829163, "step": 7316 }, { "epoch": 0.14636, "grad_norm": 6.0625, "grad_norm_var": 0.9582590738932292, "learning_rate": 0.0001, "loss": 4.1545, "loss/crossentropy": 1.4067250490188599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204126738011837, "step": 7318 }, { "epoch": 0.1464, "grad_norm": 2.390625, "grad_norm_var": 0.9473592122395833, "learning_rate": 0.0001, "loss": 4.2439, "loss/crossentropy": 1.9879435896873474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.247541606426239, "step": 7320 }, { "epoch": 0.14644, "grad_norm": 2.28125, "grad_norm_var": 0.9377919514973958, "learning_rate": 0.0001, "loss": 4.1795, "loss/crossentropy": 1.62649005651474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054424211382866, "step": 7322 }, { "epoch": 0.14648, "grad_norm": 2.15625, "grad_norm_var": 0.9344228108723959, "learning_rate": 0.0001, "loss": 4.1479, "loss/crossentropy": 1.9012435674667358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240126132965088, "step": 7324 }, { "epoch": 0.14652, "grad_norm": 2.03125, "grad_norm_var": 0.948193359375, "learning_rate": 0.0001, "loss": 3.6841, "loss/crossentropy": 1.8239200115203857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19691675901412964, "step": 7326 }, { "epoch": 0.14656, "grad_norm": 2.171875, "grad_norm_var": 0.94068603515625, "learning_rate": 0.0001, "loss": 4.2223, "loss/crossentropy": 1.9689037799835205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21301231533288956, "step": 7328 }, { "epoch": 0.1466, "grad_norm": 2.09375, "grad_norm_var": 0.9363433837890625, "learning_rate": 0.0001, "loss": 4.3733, "loss/crossentropy": 2.1064560413360596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23990632593631744, "step": 7330 }, { "epoch": 0.14664, "grad_norm": 2.109375, "grad_norm_var": 0.9397125244140625, "learning_rate": 0.0001, "loss": 4.2765, "loss/crossentropy": 2.0871987342834473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23108436167240143, "step": 7332 }, { "epoch": 0.14668, "grad_norm": 2.265625, "grad_norm_var": 0.0209869384765625, "learning_rate": 0.0001, "loss": 4.4517, "loss/crossentropy": 2.3061007857322693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24406791478395462, "step": 7334 }, { "epoch": 0.14672, "grad_norm": 2.296875, "grad_norm_var": 0.016943359375, "learning_rate": 0.0001, "loss": 4.4278, "loss/crossentropy": 1.9471244812011719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20859277993440628, "step": 7336 }, { "epoch": 0.14676, "grad_norm": 2.28125, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 4.188, "loss/crossentropy": 2.0628740191459656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25501881539821625, "step": 7338 }, { "epoch": 0.1468, "grad_norm": 1.9921875, "grad_norm_var": 0.014481353759765624, "learning_rate": 0.0001, "loss": 4.4329, "loss/crossentropy": 1.8065250515937805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20222238451242447, "step": 7340 }, { "epoch": 0.14684, "grad_norm": 2.109375, "grad_norm_var": 0.012910715738932292, "learning_rate": 0.0001, "loss": 4.5431, "loss/crossentropy": 2.134244918823242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22319403290748596, "step": 7342 }, { "epoch": 0.14688, "grad_norm": 2.109375, "grad_norm_var": 0.012359364827473959, "learning_rate": 0.0001, "loss": 4.3098, "loss/crossentropy": 2.5807924270629883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26865454018116, "step": 7344 }, { "epoch": 0.14692, "grad_norm": 2.203125, "grad_norm_var": 0.015421295166015625, "learning_rate": 0.0001, "loss": 4.684, "loss/crossentropy": 2.4128278493881226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25771988928318024, "step": 7346 }, { "epoch": 0.14696, "grad_norm": 2.046875, "grad_norm_var": 0.017561594645182293, "learning_rate": 0.0001, "loss": 4.0915, "loss/crossentropy": 1.7323983907699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146565169095993, "step": 7348 }, { "epoch": 0.147, "grad_norm": 2.125, "grad_norm_var": 0.015553538004557292, "learning_rate": 0.0001, "loss": 4.5511, "loss/crossentropy": 2.036192536354065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21692586690187454, "step": 7350 }, { "epoch": 0.14704, "grad_norm": 2.09375, "grad_norm_var": 0.012320709228515626, "learning_rate": 0.0001, "loss": 4.4173, "loss/crossentropy": 1.9586528539657593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21543999761343002, "step": 7352 }, { "epoch": 0.14708, "grad_norm": 2.234375, "grad_norm_var": 0.011805979410807292, "learning_rate": 0.0001, "loss": 4.4506, "loss/crossentropy": 2.3444113731384277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24080512672662735, "step": 7354 }, { "epoch": 0.14712, "grad_norm": 2.15625, "grad_norm_var": 0.0097076416015625, "learning_rate": 0.0001, "loss": 4.4865, "loss/crossentropy": 2.3060439825057983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24567800760269165, "step": 7356 }, { "epoch": 0.14716, "grad_norm": 2.359375, "grad_norm_var": 0.01279296875, "learning_rate": 0.0001, "loss": 4.5975, "loss/crossentropy": 2.2267106771469116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2345375493168831, "step": 7358 }, { "epoch": 0.1472, "grad_norm": 2.359375, "grad_norm_var": 0.014481608072916667, "learning_rate": 0.0001, "loss": 4.7052, "loss/crossentropy": 2.2470518350601196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2417411357164383, "step": 7360 }, { "epoch": 0.14724, "grad_norm": 1.9765625, "grad_norm_var": 0.012953440348307291, "learning_rate": 0.0001, "loss": 4.3299, "loss/crossentropy": 2.200543165206909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24434109032154083, "step": 7362 }, { "epoch": 0.14728, "grad_norm": 2.203125, "grad_norm_var": 0.010227203369140625, "learning_rate": 0.0001, "loss": 4.3282, "loss/crossentropy": 1.995844304561615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22591491788625717, "step": 7364 }, { "epoch": 0.14732, "grad_norm": 2.046875, "grad_norm_var": 0.011580149332682291, "learning_rate": 0.0001, "loss": 4.3354, "loss/crossentropy": 1.9180658459663391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22776676714420319, "step": 7366 }, { "epoch": 0.14736, "grad_norm": 2.078125, "grad_norm_var": 0.011840565999348959, "learning_rate": 0.0001, "loss": 4.3662, "loss/crossentropy": 2.473931312561035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26284685730934143, "step": 7368 }, { "epoch": 0.1474, "grad_norm": 2.109375, "grad_norm_var": 0.011744944254557292, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.0392738580703735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22323766350746155, "step": 7370 }, { "epoch": 0.14744, "grad_norm": 2.203125, "grad_norm_var": 0.012094879150390625, "learning_rate": 0.0001, "loss": 4.3032, "loss/crossentropy": 1.9847410917282104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22324485331773758, "step": 7372 }, { "epoch": 0.14748, "grad_norm": 2.171875, "grad_norm_var": 0.008459218343098958, "learning_rate": 0.0001, "loss": 4.2794, "loss/crossentropy": 1.930326521396637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24155349284410477, "step": 7374 }, { "epoch": 0.14752, "grad_norm": 2.09375, "grad_norm_var": 0.0061724344889322914, "learning_rate": 0.0001, "loss": 4.0648, "loss/crossentropy": 1.825449824333191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087552770972252, "step": 7376 }, { "epoch": 0.14756, "grad_norm": 2.15625, "grad_norm_var": 0.004325358072916666, "learning_rate": 0.0001, "loss": 4.2405, "loss/crossentropy": 2.156645655632019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2519562169909477, "step": 7378 }, { "epoch": 0.1476, "grad_norm": 2.40625, "grad_norm_var": 0.008333333333333333, "learning_rate": 0.0001, "loss": 4.5316, "loss/crossentropy": 2.255813479423523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23737946152687073, "step": 7380 }, { "epoch": 0.14764, "grad_norm": 2.109375, "grad_norm_var": 0.00797119140625, "learning_rate": 0.0001, "loss": 4.5795, "loss/crossentropy": 2.47933566570282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23890959471464157, "step": 7382 }, { "epoch": 0.14768, "grad_norm": 2.09375, "grad_norm_var": 0.007710774739583333, "learning_rate": 0.0001, "loss": 4.5328, "loss/crossentropy": 2.139566659927368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22334300726652145, "step": 7384 }, { "epoch": 0.14772, "grad_norm": 2.125, "grad_norm_var": 0.007673136393229167, "learning_rate": 0.0001, "loss": 4.1581, "loss/crossentropy": 1.6182149052619934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19345169514417648, "step": 7386 }, { "epoch": 0.14776, "grad_norm": 2.40625, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 4.4571, "loss/crossentropy": 1.9984004497528076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24823245406150818, "step": 7388 }, { "epoch": 0.1478, "grad_norm": 2.171875, "grad_norm_var": 0.010445149739583333, "learning_rate": 0.0001, "loss": 4.3218, "loss/crossentropy": 2.5892586708068848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2489030361175537, "step": 7390 }, { "epoch": 0.14784, "grad_norm": 2.140625, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.6077, "loss/crossentropy": 2.3723479509353638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23511765897274017, "step": 7392 }, { "epoch": 0.14788, "grad_norm": 2.484375, "grad_norm_var": 0.0145660400390625, "learning_rate": 0.0001, "loss": 4.3902, "loss/crossentropy": 2.001940071582794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22256013005971909, "step": 7394 }, { "epoch": 0.14792, "grad_norm": 2.0625, "grad_norm_var": 0.013850911458333334, "learning_rate": 0.0001, "loss": 4.4422, "loss/crossentropy": 2.2255555391311646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23905682563781738, "step": 7396 }, { "epoch": 0.14796, "grad_norm": 2.28125, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 4.6217, "loss/crossentropy": 2.5670583248138428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25223178416490555, "step": 7398 }, { "epoch": 0.148, "grad_norm": 2.203125, "grad_norm_var": 2.3002278645833334, "learning_rate": 0.0001, "loss": 4.6365, "loss/crossentropy": 2.16109037399292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2601509317755699, "step": 7400 }, { "epoch": 0.14804, "grad_norm": 2.09375, "grad_norm_var": 2.3012847900390625, "learning_rate": 0.0001, "loss": 4.4169, "loss/crossentropy": 2.1484315395355225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474219858646393, "step": 7402 }, { "epoch": 0.14808, "grad_norm": 4.3125, "grad_norm_var": 2.48385009765625, "learning_rate": 0.0001, "loss": 4.7452, "loss/crossentropy": 2.183099091053009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25882233679294586, "step": 7404 }, { "epoch": 0.14812, "grad_norm": 2.0625, "grad_norm_var": 2.491097005208333, "learning_rate": 0.0001, "loss": 3.9739, "loss/crossentropy": 1.795831561088562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20870305597782135, "step": 7406 }, { "epoch": 0.14816, "grad_norm": 2.09375, "grad_norm_var": 2.5058553059895834, "learning_rate": 0.0001, "loss": 4.3433, "loss/crossentropy": 2.202280640602112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150556966662407, "step": 7408 }, { "epoch": 0.1482, "grad_norm": 2.03125, "grad_norm_var": 2.5274251302083335, "learning_rate": 0.0001, "loss": 4.3063, "loss/crossentropy": 2.2616937160491943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24934212118387222, "step": 7410 }, { "epoch": 0.14824, "grad_norm": 2.171875, "grad_norm_var": 2.51627197265625, "learning_rate": 0.0001, "loss": 4.3303, "loss/crossentropy": 1.909091055393219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23191271722316742, "step": 7412 }, { "epoch": 0.14828, "grad_norm": 2.09375, "grad_norm_var": 2.5269765218098956, "learning_rate": 0.0001, "loss": 4.4224, "loss/crossentropy": 2.1383588314056396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22893162816762924, "step": 7414 }, { "epoch": 0.14832, "grad_norm": 2.234375, "grad_norm_var": 0.2983062744140625, "learning_rate": 0.0001, "loss": 4.3664, "loss/crossentropy": 2.0628907680511475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23075110465288162, "step": 7416 }, { "epoch": 0.14836, "grad_norm": 2.15625, "grad_norm_var": 0.29704488118489586, "learning_rate": 0.0001, "loss": 4.2109, "loss/crossentropy": 1.8996745347976685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21171879768371582, "step": 7418 }, { "epoch": 0.1484, "grad_norm": 2.046875, "grad_norm_var": 0.0032867431640625, "learning_rate": 0.0001, "loss": 4.4241, "loss/crossentropy": 2.2367645502090454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2563689202070236, "step": 7420 }, { "epoch": 0.14844, "grad_norm": 2.265625, "grad_norm_var": 0.003934733072916667, "learning_rate": 0.0001, "loss": 4.2326, "loss/crossentropy": 2.274489164352417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2462785318493843, "step": 7422 }, { "epoch": 0.14848, "grad_norm": 2.328125, "grad_norm_var": 0.00562744140625, "learning_rate": 0.0001, "loss": 4.5288, "loss/crossentropy": 2.227620482444763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24729500710964203, "step": 7424 }, { "epoch": 0.14852, "grad_norm": 2.0, "grad_norm_var": 0.006864420572916667, "learning_rate": 0.0001, "loss": 4.0913, "loss/crossentropy": 1.9911785125732422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216292105615139, "step": 7426 }, { "epoch": 0.14856, "grad_norm": 2.15625, "grad_norm_var": 0.008199055989583334, "learning_rate": 0.0001, "loss": 4.4071, "loss/crossentropy": 1.8715736865997314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2289058193564415, "step": 7428 }, { "epoch": 0.1486, "grad_norm": 2.125, "grad_norm_var": 0.007991536458333334, "learning_rate": 0.0001, "loss": 4.4671, "loss/crossentropy": 2.0192378759384155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327374964952469, "step": 7430 }, { "epoch": 0.14864, "grad_norm": 2.046875, "grad_norm_var": 0.008226521809895833, "learning_rate": 0.0001, "loss": 4.213, "loss/crossentropy": 1.9407023191452026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22586089372634888, "step": 7432 }, { "epoch": 0.14868, "grad_norm": 2.203125, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.7816, "loss/crossentropy": 2.6349592208862305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2605717331171036, "step": 7434 }, { "epoch": 0.14872, "grad_norm": 2.28125, "grad_norm_var": 0.009989420572916666, "learning_rate": 0.0001, "loss": 4.5511, "loss/crossentropy": 2.3015077114105225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2292959839105606, "step": 7436 }, { "epoch": 0.14876, "grad_norm": 2.1875, "grad_norm_var": 0.013231404622395833, "learning_rate": 0.0001, "loss": 4.1663, "loss/crossentropy": 2.081148624420166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21435046195983887, "step": 7438 }, { "epoch": 0.1488, "grad_norm": 2.109375, "grad_norm_var": 0.011735026041666667, "learning_rate": 0.0001, "loss": 4.4342, "loss/crossentropy": 2.235422372817993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23794984817504883, "step": 7440 }, { "epoch": 0.14884, "grad_norm": 2.171875, "grad_norm_var": 0.011295572916666666, "learning_rate": 0.0001, "loss": 4.4595, "loss/crossentropy": 2.1423263549804688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200494110584259, "step": 7442 }, { "epoch": 0.14888, "grad_norm": 2.421875, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 4.514, "loss/crossentropy": 2.3156672716140747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25282321125268936, "step": 7444 }, { "epoch": 0.14892, "grad_norm": 2.15625, "grad_norm_var": 0.020051066080729166, "learning_rate": 0.0001, "loss": 4.3107, "loss/crossentropy": 1.7777396440505981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104710191488266, "step": 7446 }, { "epoch": 0.14896, "grad_norm": 2.09375, "grad_norm_var": 0.023341623942057292, "learning_rate": 0.0001, "loss": 3.874, "loss/crossentropy": 1.8015541434288025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976182758808136, "step": 7448 }, { "epoch": 0.149, "grad_norm": 2.578125, "grad_norm_var": 0.04948298136393229, "learning_rate": 0.0001, "loss": 4.5647, "loss/crossentropy": 2.36995792388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23767317831516266, "step": 7450 }, { "epoch": 0.14904, "grad_norm": 2.1875, "grad_norm_var": 0.050142161051432294, "learning_rate": 0.0001, "loss": 4.7568, "loss/crossentropy": 2.2867971062660217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2666979879140854, "step": 7452 }, { "epoch": 0.14908, "grad_norm": 2.328125, "grad_norm_var": 0.043794504801432294, "learning_rate": 0.0001, "loss": 4.773, "loss/crossentropy": 2.247913956642151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122594192624092, "step": 7454 }, { "epoch": 0.14912, "grad_norm": 2.109375, "grad_norm_var": 0.04197362263997396, "learning_rate": 0.0001, "loss": 4.273, "loss/crossentropy": 2.0020886063575745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22448039799928665, "step": 7456 }, { "epoch": 0.14916, "grad_norm": 2.15625, "grad_norm_var": 0.043702952067057294, "learning_rate": 0.0001, "loss": 4.3139, "loss/crossentropy": 2.2290679216384888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21489766240119934, "step": 7458 }, { "epoch": 0.1492, "grad_norm": 2.609375, "grad_norm_var": 0.049344635009765624, "learning_rate": 0.0001, "loss": 4.5919, "loss/crossentropy": 2.0447877049446106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22640856355428696, "step": 7460 }, { "epoch": 0.14924, "grad_norm": 2.3125, "grad_norm_var": 0.045904286702473956, "learning_rate": 0.0001, "loss": 4.4614, "loss/crossentropy": 2.395468831062317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2395479902625084, "step": 7462 }, { "epoch": 0.14928, "grad_norm": 2.0625, "grad_norm_var": 0.04019775390625, "learning_rate": 0.0001, "loss": 4.1552, "loss/crossentropy": 2.117182433605194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22302603721618652, "step": 7464 }, { "epoch": 0.14932, "grad_norm": 2.125, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 4.4046, "loss/crossentropy": 2.206045985221863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2548774778842926, "step": 7466 }, { "epoch": 0.14936, "grad_norm": 2.046875, "grad_norm_var": 0.0194244384765625, "learning_rate": 0.0001, "loss": 4.1531, "loss/crossentropy": 1.9933450818061829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22455794364213943, "step": 7468 }, { "epoch": 0.1494, "grad_norm": 2.125, "grad_norm_var": 0.017731730143229166, "learning_rate": 0.0001, "loss": 4.2744, "loss/crossentropy": 2.216577649116516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24432705342769623, "step": 7470 }, { "epoch": 0.14944, "grad_norm": 2.25, "grad_norm_var": 0.018163045247395832, "learning_rate": 0.0001, "loss": 4.6072, "loss/crossentropy": 2.4282405376434326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2599884122610092, "step": 7472 }, { "epoch": 0.14948, "grad_norm": 2.140625, "grad_norm_var": 0.018089803059895833, "learning_rate": 0.0001, "loss": 4.2833, "loss/crossentropy": 2.2677053213119507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24509359151124954, "step": 7474 }, { "epoch": 0.14952, "grad_norm": 2.046875, "grad_norm_var": 0.007877604166666666, "learning_rate": 0.0001, "loss": 4.1271, "loss/crossentropy": 1.9610475897789001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19867657870054245, "step": 7476 }, { "epoch": 0.14956, "grad_norm": 2.4375, "grad_norm_var": 0.012565104166666667, "learning_rate": 0.0001, "loss": 4.6286, "loss/crossentropy": 2.1379209756851196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24773475527763367, "step": 7478 }, { "epoch": 0.1496, "grad_norm": 2.0625, "grad_norm_var": 0.0125152587890625, "learning_rate": 0.0001, "loss": 4.3593, "loss/crossentropy": 2.295411467552185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2417762354016304, "step": 7480 }, { "epoch": 0.14964, "grad_norm": 2.28125, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 4.4839, "loss/crossentropy": 2.0983279943466187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22220823168754578, "step": 7482 }, { "epoch": 0.14968, "grad_norm": 2.34375, "grad_norm_var": 0.015086873372395834, "learning_rate": 0.0001, "loss": 4.3977, "loss/crossentropy": 2.130508065223694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23387905955314636, "step": 7484 }, { "epoch": 0.14972, "grad_norm": 2.265625, "grad_norm_var": 0.015869140625, "learning_rate": 0.0001, "loss": 4.545, "loss/crossentropy": 2.0727924704551697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23198049515485764, "step": 7486 }, { "epoch": 0.14976, "grad_norm": 2.28125, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 4.3722, "loss/crossentropy": 2.1407171487808228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23349495232105255, "step": 7488 }, { "epoch": 0.1498, "grad_norm": 2.0625, "grad_norm_var": 0.016063435872395834, "learning_rate": 0.0001, "loss": 4.3483, "loss/crossentropy": 2.123879909515381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2230711579322815, "step": 7490 }, { "epoch": 0.14984, "grad_norm": 1.8984375, "grad_norm_var": 0.02194188435872396, "learning_rate": 0.0001, "loss": 3.8687, "loss/crossentropy": 2.1362847685813904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22217128425836563, "step": 7492 }, { "epoch": 0.14988, "grad_norm": 2.234375, "grad_norm_var": 0.01625544230143229, "learning_rate": 0.0001, "loss": 4.4753, "loss/crossentropy": 2.2171897292137146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.237472265958786, "step": 7494 }, { "epoch": 0.14992, "grad_norm": 2.203125, "grad_norm_var": 0.016841379801432292, "learning_rate": 0.0001, "loss": 4.2439, "loss/crossentropy": 1.9622138142585754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19700734317302704, "step": 7496 }, { "epoch": 0.14996, "grad_norm": 2.078125, "grad_norm_var": 0.01666234334309896, "learning_rate": 0.0001, "loss": 4.1296, "loss/crossentropy": 2.300232410430908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26260019838809967, "step": 7498 }, { "epoch": 0.15, "grad_norm": 2.21875, "grad_norm_var": 0.013765207926432292, "learning_rate": 0.0001, "loss": 4.4466, "loss/crossentropy": 1.9829946756362915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21770135313272476, "step": 7500 }, { "epoch": 0.15004, "grad_norm": 2.109375, "grad_norm_var": 0.010935211181640625, "learning_rate": 0.0001, "loss": 4.017, "loss/crossentropy": 1.8421878218650818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20965785533189774, "step": 7502 }, { "epoch": 0.15008, "grad_norm": 2.203125, "grad_norm_var": 0.010267893473307291, "learning_rate": 0.0001, "loss": 4.3111, "loss/crossentropy": 2.0559566020965576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21752064675092697, "step": 7504 }, { "epoch": 0.15012, "grad_norm": 2.25, "grad_norm_var": 0.009834543863932291, "learning_rate": 0.0001, "loss": 4.5526, "loss/crossentropy": 1.862777590751648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026660442352295, "step": 7506 }, { "epoch": 0.15016, "grad_norm": 2.046875, "grad_norm_var": 0.0064198811848958336, "learning_rate": 0.0001, "loss": 4.0238, "loss/crossentropy": 2.340041399002075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23489895462989807, "step": 7508 }, { "epoch": 0.1502, "grad_norm": 2.21875, "grad_norm_var": 0.006224568684895833, "learning_rate": 0.0001, "loss": 4.5434, "loss/crossentropy": 2.3596150875091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23775358498096466, "step": 7510 }, { "epoch": 0.15024, "grad_norm": 2.015625, "grad_norm_var": 0.005980428059895833, "learning_rate": 0.0001, "loss": 4.4637, "loss/crossentropy": 1.883503019809723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22191710770130157, "step": 7512 }, { "epoch": 0.15028, "grad_norm": 2.09375, "grad_norm_var": 0.006723785400390625, "learning_rate": 0.0001, "loss": 4.1485, "loss/crossentropy": 2.12862491607666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2177818939089775, "step": 7514 }, { "epoch": 0.15032, "grad_norm": 2.078125, "grad_norm_var": 0.005936431884765625, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 1.9326343536376953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20991001278162003, "step": 7516 }, { "epoch": 0.15036, "grad_norm": 2.046875, "grad_norm_var": 0.008070627848307291, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 2.1532761454582214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308126464486122, "step": 7518 }, { "epoch": 0.1504, "grad_norm": 2.21875, "grad_norm_var": 0.007867177327473959, "learning_rate": 0.0001, "loss": 4.42, "loss/crossentropy": 1.876515507698059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1940685734152794, "step": 7520 }, { "epoch": 0.15044, "grad_norm": 2.28125, "grad_norm_var": 0.008937327067057292, "learning_rate": 0.0001, "loss": 4.3515, "loss/crossentropy": 2.3677611351013184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24787116795778275, "step": 7522 }, { "epoch": 0.15048, "grad_norm": 2.296875, "grad_norm_var": 0.010892486572265625, "learning_rate": 0.0001, "loss": 4.0489, "loss/crossentropy": 1.8500076532363892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19271107017993927, "step": 7524 }, { "epoch": 0.15052, "grad_norm": 2.171875, "grad_norm_var": 0.010432688395182292, "learning_rate": 0.0001, "loss": 4.3298, "loss/crossentropy": 1.8013980984687805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2181655913591385, "step": 7526 }, { "epoch": 0.15056, "grad_norm": 2.09375, "grad_norm_var": 0.010361480712890624, "learning_rate": 0.0001, "loss": 4.4958, "loss/crossentropy": 2.6469568014144897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2441466525197029, "step": 7528 }, { "epoch": 0.1506, "grad_norm": 2.203125, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.1317, "loss/crossentropy": 1.7992960214614868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2209397256374359, "step": 7530 }, { "epoch": 0.15064, "grad_norm": 2.625, "grad_norm_var": 0.023258463541666666, "learning_rate": 0.0001, "loss": 4.9501, "loss/crossentropy": 2.1914783120155334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23764144629240036, "step": 7532 }, { "epoch": 0.15068, "grad_norm": 2.140625, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 4.4833, "loss/crossentropy": 2.0035120844841003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21483591943979263, "step": 7534 }, { "epoch": 0.15072, "grad_norm": 1.921875, "grad_norm_var": 0.03235677083333333, "learning_rate": 0.0001, "loss": 4.2674, "loss/crossentropy": 2.0218639969825745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20919294655323029, "step": 7536 }, { "epoch": 0.15076, "grad_norm": 2.015625, "grad_norm_var": 0.03219401041666667, "learning_rate": 0.0001, "loss": 4.1091, "loss/crossentropy": 2.006688416004181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21247616410255432, "step": 7538 }, { "epoch": 0.1508, "grad_norm": 2.484375, "grad_norm_var": 0.03542378743489583, "learning_rate": 0.0001, "loss": 4.5424, "loss/crossentropy": 2.1565613746643066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30109211802482605, "step": 7540 }, { "epoch": 0.15084, "grad_norm": 2.359375, "grad_norm_var": 0.03603413899739583, "learning_rate": 0.0001, "loss": 4.3621, "loss/crossentropy": 1.9676685333251953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23605409264564514, "step": 7542 }, { "epoch": 0.15088, "grad_norm": 2.34375, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 4.4178, "loss/crossentropy": 2.1150137186050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22903620451688766, "step": 7544 }, { "epoch": 0.15092, "grad_norm": 2.34375, "grad_norm_var": 0.051253255208333334, "learning_rate": 0.0001, "loss": 4.3186, "loss/crossentropy": 2.1894554495811462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24362927675247192, "step": 7546 }, { "epoch": 0.15096, "grad_norm": 2.21875, "grad_norm_var": 0.0410308837890625, "learning_rate": 0.0001, "loss": 4.4529, "loss/crossentropy": 1.9624019861221313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22628726810216904, "step": 7548 }, { "epoch": 0.151, "grad_norm": 2.0, "grad_norm_var": 0.04338277180989583, "learning_rate": 0.0001, "loss": 4.2795, "loss/crossentropy": 2.1547625064849854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295984849333763, "step": 7550 }, { "epoch": 0.15104, "grad_norm": 2.125, "grad_norm_var": 0.03857014973958333, "learning_rate": 0.0001, "loss": 4.4638, "loss/crossentropy": 2.229305863380432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2427791878581047, "step": 7552 }, { "epoch": 0.15108, "grad_norm": 2.4375, "grad_norm_var": 0.037093098958333334, "learning_rate": 0.0001, "loss": 4.7319, "loss/crossentropy": 2.4998362064361572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23718395829200745, "step": 7554 }, { "epoch": 0.15112, "grad_norm": 2.15625, "grad_norm_var": 0.033722941080729166, "learning_rate": 0.0001, "loss": 4.2118, "loss/crossentropy": 2.319428563117981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26024487614631653, "step": 7556 }, { "epoch": 0.15116, "grad_norm": 2.015625, "grad_norm_var": 0.0357818603515625, "learning_rate": 0.0001, "loss": 4.2451, "loss/crossentropy": 1.861966609954834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121218591928482, "step": 7558 }, { "epoch": 0.1512, "grad_norm": 2.265625, "grad_norm_var": 0.015751139322916666, "learning_rate": 0.0001, "loss": 4.5963, "loss/crossentropy": 2.1688510179519653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22341617196798325, "step": 7560 }, { "epoch": 0.15124, "grad_norm": 2.140625, "grad_norm_var": 0.013932291666666667, "learning_rate": 0.0001, "loss": 4.4808, "loss/crossentropy": 2.2833406925201416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22648481279611588, "step": 7562 }, { "epoch": 0.15128, "grad_norm": 2.203125, "grad_norm_var": 0.0138671875, "learning_rate": 0.0001, "loss": 4.4126, "loss/crossentropy": 2.224393129348755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23414459079504013, "step": 7564 }, { "epoch": 0.15132, "grad_norm": 2.25, "grad_norm_var": 0.0098297119140625, "learning_rate": 0.0001, "loss": 4.253, "loss/crossentropy": 2.003828763961792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166791632771492, "step": 7566 }, { "epoch": 0.15136, "grad_norm": 2.3125, "grad_norm_var": 0.01168212890625, "learning_rate": 0.0001, "loss": 4.0904, "loss/crossentropy": 2.1813005208969116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28873007744550705, "step": 7568 }, { "epoch": 0.1514, "grad_norm": 1.9921875, "grad_norm_var": 0.028696441650390626, "learning_rate": 0.0001, "loss": 4.3824, "loss/crossentropy": 2.3818061351776123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24916332960128784, "step": 7570 }, { "epoch": 0.15144, "grad_norm": 2.03125, "grad_norm_var": 0.030460357666015625, "learning_rate": 0.0001, "loss": 4.1825, "loss/crossentropy": 2.041518449783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22352956235408783, "step": 7572 }, { "epoch": 0.15148, "grad_norm": 1.9609375, "grad_norm_var": 0.0326904296875, "learning_rate": 0.0001, "loss": 3.8531, "loss/crossentropy": 1.856759488582611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20227209478616714, "step": 7574 }, { "epoch": 0.15152, "grad_norm": 2.09375, "grad_norm_var": 0.03157450358072917, "learning_rate": 0.0001, "loss": 4.237, "loss/crossentropy": 1.9612281918525696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22188737243413925, "step": 7576 }, { "epoch": 0.15156, "grad_norm": 2.015625, "grad_norm_var": 0.0337554931640625, "learning_rate": 0.0001, "loss": 4.0405, "loss/crossentropy": 1.9610649943351746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2282957062125206, "step": 7578 }, { "epoch": 0.1516, "grad_norm": 2.078125, "grad_norm_var": 0.03400472005208333, "learning_rate": 0.0001, "loss": 4.1253, "loss/crossentropy": 1.9239189624786377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22926658391952515, "step": 7580 }, { "epoch": 0.15164, "grad_norm": 2.171875, "grad_norm_var": 0.033299763997395836, "learning_rate": 0.0001, "loss": 4.548, "loss/crossentropy": 2.425737738609314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24260805547237396, "step": 7582 }, { "epoch": 0.15168, "grad_norm": 2.28125, "grad_norm_var": 0.03242085774739583, "learning_rate": 0.0001, "loss": 4.2894, "loss/crossentropy": 1.9602521061897278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23231150209903717, "step": 7584 }, { "epoch": 0.15172, "grad_norm": 2.03125, "grad_norm_var": 0.007954661051432292, "learning_rate": 0.0001, "loss": 4.3131, "loss/crossentropy": 2.3671375513076782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24326395988464355, "step": 7586 }, { "epoch": 0.15176, "grad_norm": 2.015625, "grad_norm_var": 0.007675933837890625, "learning_rate": 0.0001, "loss": 4.1297, "loss/crossentropy": 2.0128119587898254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20082567632198334, "step": 7588 }, { "epoch": 0.1518, "grad_norm": 2.265625, "grad_norm_var": 0.0085845947265625, "learning_rate": 0.0001, "loss": 4.5813, "loss/crossentropy": 2.3719125986099243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.243422269821167, "step": 7590 }, { "epoch": 0.15184, "grad_norm": 2.28125, "grad_norm_var": 0.010575358072916667, "learning_rate": 0.0001, "loss": 4.6177, "loss/crossentropy": 2.088695764541626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23578013479709625, "step": 7592 }, { "epoch": 0.15188, "grad_norm": 2.203125, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 4.3719, "loss/crossentropy": 2.151872456073761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2496410757303238, "step": 7594 }, { "epoch": 0.15192, "grad_norm": 2.265625, "grad_norm_var": 0.017281087239583333, "learning_rate": 0.0001, "loss": 4.6041, "loss/crossentropy": 2.020963430404663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25003430247306824, "step": 7596 }, { "epoch": 0.15196, "grad_norm": 2.34375, "grad_norm_var": 0.0199615478515625, "learning_rate": 0.0001, "loss": 4.4439, "loss/crossentropy": 1.9290395379066467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20771078765392303, "step": 7598 }, { "epoch": 0.152, "grad_norm": 2.09375, "grad_norm_var": 0.020536295572916665, "learning_rate": 0.0001, "loss": 4.2461, "loss/crossentropy": 2.0420188307762146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2375432327389717, "step": 7600 }, { "epoch": 0.15204, "grad_norm": 2.21875, "grad_norm_var": 0.019169108072916666, "learning_rate": 0.0001, "loss": 4.4291, "loss/crossentropy": 2.474969744682312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2535504847764969, "step": 7602 }, { "epoch": 0.15208, "grad_norm": 2.3125, "grad_norm_var": 0.015550740559895833, "learning_rate": 0.0001, "loss": 4.3487, "loss/crossentropy": 2.177125334739685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24919381737709045, "step": 7604 }, { "epoch": 0.15212, "grad_norm": 2.3125, "grad_norm_var": 0.015510050455729167, "learning_rate": 0.0001, "loss": 4.4719, "loss/crossentropy": 2.348747491836548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22336618602275848, "step": 7606 }, { "epoch": 0.15216, "grad_norm": 2.28125, "grad_norm_var": 0.014676920572916667, "learning_rate": 0.0001, "loss": 4.4502, "loss/crossentropy": 1.718321442604065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19830843806266785, "step": 7608 }, { "epoch": 0.1522, "grad_norm": 2.171875, "grad_norm_var": 0.015901692708333335, "learning_rate": 0.0001, "loss": 4.4505, "loss/crossentropy": 2.2954800128936768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23203244805335999, "step": 7610 }, { "epoch": 0.15224, "grad_norm": 2.0625, "grad_norm_var": 0.013570149739583334, "learning_rate": 0.0001, "loss": 4.1141, "loss/crossentropy": 1.6918454766273499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18312305957078934, "step": 7612 }, { "epoch": 0.15228, "grad_norm": 2.015625, "grad_norm_var": 0.012727864583333333, "learning_rate": 0.0001, "loss": 4.1312, "loss/crossentropy": 1.887774109840393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151220142841339, "step": 7614 }, { "epoch": 0.15232, "grad_norm": 2.125, "grad_norm_var": 0.01373291015625, "learning_rate": 0.0001, "loss": 4.1274, "loss/crossentropy": 2.0903998613357544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22486191242933273, "step": 7616 }, { "epoch": 0.15236, "grad_norm": 2.0625, "grad_norm_var": 0.016310373942057293, "learning_rate": 0.0001, "loss": 4.0659, "loss/crossentropy": 1.929358720779419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20021560043096542, "step": 7618 }, { "epoch": 0.1524, "grad_norm": 2.703125, "grad_norm_var": 0.03484064737955729, "learning_rate": 0.0001, "loss": 4.5726, "loss/crossentropy": 1.9812661409378052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21395482122898102, "step": 7620 }, { "epoch": 0.15244, "grad_norm": 2.390625, "grad_norm_var": 0.03358942667643229, "learning_rate": 0.0001, "loss": 4.15, "loss/crossentropy": 2.148552179336548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22805052995681763, "step": 7622 }, { "epoch": 0.15248, "grad_norm": 2.078125, "grad_norm_var": 0.03416926066080729, "learning_rate": 0.0001, "loss": 4.3471, "loss/crossentropy": 2.012804687023163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2248745858669281, "step": 7624 }, { "epoch": 0.15252, "grad_norm": 2.046875, "grad_norm_var": 0.03463312784830729, "learning_rate": 0.0001, "loss": 4.3948, "loss/crossentropy": 2.3378156423568726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2457970678806305, "step": 7626 }, { "epoch": 0.15256, "grad_norm": 2.296875, "grad_norm_var": 0.036382802327473956, "learning_rate": 0.0001, "loss": 4.5414, "loss/crossentropy": 2.0815274119377136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22083784639835358, "step": 7628 }, { "epoch": 0.1526, "grad_norm": 1.984375, "grad_norm_var": 0.039589182535807295, "learning_rate": 0.0001, "loss": 4.2609, "loss/crossentropy": 2.172307014465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25988608598709106, "step": 7630 }, { "epoch": 0.15264, "grad_norm": 1.9921875, "grad_norm_var": 0.03921305338541667, "learning_rate": 0.0001, "loss": 4.3711, "loss/crossentropy": 1.973683476448059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20379862189292908, "step": 7632 }, { "epoch": 0.15268, "grad_norm": 1.921875, "grad_norm_var": 0.039406077067057295, "learning_rate": 0.0001, "loss": 4.3394, "loss/crossentropy": 2.175020456314087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22480874508619308, "step": 7634 }, { "epoch": 0.15272, "grad_norm": 2.1875, "grad_norm_var": 0.022304026285807292, "learning_rate": 0.0001, "loss": 4.2176, "loss/crossentropy": 2.063227415084839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2324352264404297, "step": 7636 }, { "epoch": 0.15276, "grad_norm": 2.296875, "grad_norm_var": 0.019769032796223957, "learning_rate": 0.0001, "loss": 4.4654, "loss/crossentropy": 1.9297555088996887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171614021062851, "step": 7638 }, { "epoch": 0.1528, "grad_norm": 2.296875, "grad_norm_var": 0.019421132405598958, "learning_rate": 0.0001, "loss": 4.4698, "loss/crossentropy": 1.9808542132377625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21765583008527756, "step": 7640 }, { "epoch": 0.15284, "grad_norm": 2.09375, "grad_norm_var": 0.018790435791015626, "learning_rate": 0.0001, "loss": 4.4145, "loss/crossentropy": 2.009860336780548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20963389426469803, "step": 7642 }, { "epoch": 0.15288, "grad_norm": 2.015625, "grad_norm_var": 0.017319488525390624, "learning_rate": 0.0001, "loss": 4.2649, "loss/crossentropy": 1.912703514099121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23796956986188889, "step": 7644 }, { "epoch": 0.15292, "grad_norm": 2.25, "grad_norm_var": 0.012341054280598958, "learning_rate": 0.0001, "loss": 4.4965, "loss/crossentropy": 2.2945642471313477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2621624022722244, "step": 7646 }, { "epoch": 0.15296, "grad_norm": 2.03125, "grad_norm_var": 0.011067708333333334, "learning_rate": 0.0001, "loss": 4.2573, "loss/crossentropy": 2.185902237892151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23890018463134766, "step": 7648 }, { "epoch": 0.153, "grad_norm": 2.546875, "grad_norm_var": 0.018648274739583335, "learning_rate": 0.0001, "loss": 4.5815, "loss/crossentropy": 1.8625503778457642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206135168671608, "step": 7650 }, { "epoch": 0.15304, "grad_norm": 1.9765625, "grad_norm_var": 0.020684560139973957, "learning_rate": 0.0001, "loss": 3.9142, "loss/crossentropy": 1.8815893530845642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21166741847991943, "step": 7652 }, { "epoch": 0.15308, "grad_norm": 2.203125, "grad_norm_var": 0.020979563395182293, "learning_rate": 0.0001, "loss": 4.2277, "loss/crossentropy": 2.156697630882263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22994951903820038, "step": 7654 }, { "epoch": 0.15312, "grad_norm": 2.25, "grad_norm_var": 0.02005182902018229, "learning_rate": 0.0001, "loss": 4.366, "loss/crossentropy": 1.6800576448440552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21905134618282318, "step": 7656 }, { "epoch": 0.15316, "grad_norm": 2.046875, "grad_norm_var": 0.02061945597330729, "learning_rate": 0.0001, "loss": 4.1446, "loss/crossentropy": 1.9215145707130432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034405767917633, "step": 7658 }, { "epoch": 0.1532, "grad_norm": 2.390625, "grad_norm_var": 0.02380956013997396, "learning_rate": 0.0001, "loss": 4.6475, "loss/crossentropy": 2.153718650341034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25605182349681854, "step": 7660 }, { "epoch": 0.15324, "grad_norm": 2.203125, "grad_norm_var": 0.02466608683268229, "learning_rate": 0.0001, "loss": 4.2676, "loss/crossentropy": 1.8782889246940613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20574549585580826, "step": 7662 }, { "epoch": 0.15328, "grad_norm": 2.265625, "grad_norm_var": 0.031040191650390625, "learning_rate": 0.0001, "loss": 4.2171, "loss/crossentropy": 2.003354489803314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24152260273694992, "step": 7664 }, { "epoch": 0.15332, "grad_norm": 2.203125, "grad_norm_var": 0.022299957275390626, "learning_rate": 0.0001, "loss": 4.5828, "loss/crossentropy": 2.217758059501648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24700726568698883, "step": 7666 }, { "epoch": 0.15336, "grad_norm": 2.1875, "grad_norm_var": 0.017757161458333334, "learning_rate": 0.0001, "loss": 4.3418, "loss/crossentropy": 1.934537410736084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163502648472786, "step": 7668 }, { "epoch": 0.1534, "grad_norm": 2.03125, "grad_norm_var": 0.020340983072916666, "learning_rate": 0.0001, "loss": 4.3, "loss/crossentropy": 2.007661819458008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21769292652606964, "step": 7670 }, { "epoch": 0.15344, "grad_norm": 2.09375, "grad_norm_var": 0.020653279622395833, "learning_rate": 0.0001, "loss": 4.3082, "loss/crossentropy": 2.1586949825286865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2453690618276596, "step": 7672 }, { "epoch": 0.15348, "grad_norm": 2.109375, "grad_norm_var": 0.0197174072265625, "learning_rate": 0.0001, "loss": 4.4433, "loss/crossentropy": 2.2201706171035767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22639702260494232, "step": 7674 }, { "epoch": 0.15352, "grad_norm": 2.15625, "grad_norm_var": 0.015104166666666667, "learning_rate": 0.0001, "loss": 4.2337, "loss/crossentropy": 2.059146285057068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22411910444498062, "step": 7676 }, { "epoch": 0.15356, "grad_norm": 2.0625, "grad_norm_var": 0.015397135416666667, "learning_rate": 0.0001, "loss": 4.2318, "loss/crossentropy": 1.9768954515457153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21798508614301682, "step": 7678 }, { "epoch": 0.1536, "grad_norm": 2.109375, "grad_norm_var": 0.003413899739583333, "learning_rate": 0.0001, "loss": 4.4823, "loss/crossentropy": 1.8555094003677368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19453337788581848, "step": 7680 }, { "epoch": 0.15364, "grad_norm": 2.03125, "grad_norm_var": 0.0036783854166666666, "learning_rate": 0.0001, "loss": 4.0379, "loss/crossentropy": 1.5948917865753174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17746463418006897, "step": 7682 }, { "epoch": 0.15368, "grad_norm": 2.1875, "grad_norm_var": 0.0034464518229166668, "learning_rate": 0.0001, "loss": 4.2901, "loss/crossentropy": 1.8917757868766785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103574424982071, "step": 7684 }, { "epoch": 0.15372, "grad_norm": 2.140625, "grad_norm_var": 0.0030670166015625, "learning_rate": 0.0001, "loss": 4.3326, "loss/crossentropy": 2.090232729911804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199823334813118, "step": 7686 }, { "epoch": 0.15376, "grad_norm": 2.03125, "grad_norm_var": 0.004979451497395833, "learning_rate": 0.0001, "loss": 4.3695, "loss/crossentropy": 1.8155178427696228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21044857800006866, "step": 7688 }, { "epoch": 0.1538, "grad_norm": 2.09375, "grad_norm_var": 0.0063629150390625, "learning_rate": 0.0001, "loss": 4.5003, "loss/crossentropy": 2.31532621383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2437051385641098, "step": 7690 }, { "epoch": 0.15384, "grad_norm": 2.078125, "grad_norm_var": 0.0065419514973958336, "learning_rate": 0.0001, "loss": 4.3815, "loss/crossentropy": 1.9688079357147217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22686263918876648, "step": 7692 }, { "epoch": 0.15388, "grad_norm": 2.046875, "grad_norm_var": 0.010001373291015626, "learning_rate": 0.0001, "loss": 3.9937, "loss/crossentropy": 1.9029017686843872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20635761320590973, "step": 7694 }, { "epoch": 0.15392, "grad_norm": 1.9140625, "grad_norm_var": 0.010587565104166667, "learning_rate": 0.0001, "loss": 3.91, "loss/crossentropy": 1.9817028641700745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2080235257744789, "step": 7696 }, { "epoch": 0.15396, "grad_norm": 2.171875, "grad_norm_var": 0.017463175455729167, "learning_rate": 0.0001, "loss": 4.3301, "loss/crossentropy": 2.3392014503479004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26226382702589035, "step": 7698 }, { "epoch": 0.154, "grad_norm": 1.9375, "grad_norm_var": 0.021320597330729166, "learning_rate": 0.0001, "loss": 4.0381, "loss/crossentropy": 1.7265403866767883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21790936589241028, "step": 7700 }, { "epoch": 0.15404, "grad_norm": 2.203125, "grad_norm_var": 0.02276585896809896, "learning_rate": 0.0001, "loss": 4.1725, "loss/crossentropy": 2.024384081363678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21987473219633102, "step": 7702 }, { "epoch": 0.15408, "grad_norm": 2.5, "grad_norm_var": 0.03117650349934896, "learning_rate": 0.0001, "loss": 4.6702, "loss/crossentropy": 2.1840893030166626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24231631308794022, "step": 7704 }, { "epoch": 0.15412, "grad_norm": 2.171875, "grad_norm_var": 0.03001683553059896, "learning_rate": 0.0001, "loss": 4.2197, "loss/crossentropy": 2.1950928568840027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21808429062366486, "step": 7706 }, { "epoch": 0.15416, "grad_norm": 2.140625, "grad_norm_var": 0.03029352823893229, "learning_rate": 0.0001, "loss": 4.3215, "loss/crossentropy": 1.9541537165641785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22109197825193405, "step": 7708 }, { "epoch": 0.1542, "grad_norm": 2.03125, "grad_norm_var": 0.0257476806640625, "learning_rate": 0.0001, "loss": 4.1966, "loss/crossentropy": 2.0232877135276794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214762382209301, "step": 7710 }, { "epoch": 0.15424, "grad_norm": 1.9609375, "grad_norm_var": 0.023859659830729168, "learning_rate": 0.0001, "loss": 4.0012, "loss/crossentropy": 2.003768503665924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21820105612277985, "step": 7712 }, { "epoch": 0.15428, "grad_norm": 2.21875, "grad_norm_var": 0.020026652018229167, "learning_rate": 0.0001, "loss": 4.2471, "loss/crossentropy": 2.007221221923828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2203196883201599, "step": 7714 }, { "epoch": 0.15432, "grad_norm": 2.09375, "grad_norm_var": 0.015900675455729166, "learning_rate": 0.0001, "loss": 4.3201, "loss/crossentropy": 2.0134615898132324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916545927524567, "step": 7716 }, { "epoch": 0.15436, "grad_norm": 2.328125, "grad_norm_var": 0.016013336181640626, "learning_rate": 0.0001, "loss": 4.3821, "loss/crossentropy": 1.9012999534606934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25758640468120575, "step": 7718 }, { "epoch": 0.1544, "grad_norm": 2.25, "grad_norm_var": 0.008727773030598959, "learning_rate": 0.0001, "loss": 4.1555, "loss/crossentropy": 2.074169874191284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22667942196130753, "step": 7720 }, { "epoch": 0.15444, "grad_norm": 2.21875, "grad_norm_var": 0.009059397379557292, "learning_rate": 0.0001, "loss": 4.4355, "loss/crossentropy": 2.070925295352936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20353248715400696, "step": 7722 }, { "epoch": 0.15448, "grad_norm": 2.359375, "grad_norm_var": 0.011163075764973959, "learning_rate": 0.0001, "loss": 4.5676, "loss/crossentropy": 2.289568066596985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23175117373466492, "step": 7724 }, { "epoch": 0.15452, "grad_norm": 2.15625, "grad_norm_var": 0.012827301025390625, "learning_rate": 0.0001, "loss": 4.6152, "loss/crossentropy": 2.21374249458313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23585008084774017, "step": 7726 }, { "epoch": 0.15456, "grad_norm": 2.328125, "grad_norm_var": 0.010791015625, "learning_rate": 0.0001, "loss": 4.5575, "loss/crossentropy": 2.15897136926651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24143048375844955, "step": 7728 }, { "epoch": 0.1546, "grad_norm": 2.359375, "grad_norm_var": 0.01142578125, "learning_rate": 0.0001, "loss": 4.4722, "loss/crossentropy": 2.134206771850586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24344487488269806, "step": 7730 }, { "epoch": 0.15464, "grad_norm": 2.359375, "grad_norm_var": 0.011180623372395834, "learning_rate": 0.0001, "loss": 4.5827, "loss/crossentropy": 2.3832513093948364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24878299236297607, "step": 7732 }, { "epoch": 0.15468, "grad_norm": 2.203125, "grad_norm_var": 0.0102203369140625, "learning_rate": 0.0001, "loss": 4.0722, "loss/crossentropy": 1.917544960975647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20134451985359192, "step": 7734 }, { "epoch": 0.15472, "grad_norm": 2.40625, "grad_norm_var": 0.011767578125, "learning_rate": 0.0001, "loss": 4.4499, "loss/crossentropy": 2.1081286668777466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23577219247817993, "step": 7736 }, { "epoch": 0.15476, "grad_norm": 2.4375, "grad_norm_var": 0.0137603759765625, "learning_rate": 0.0001, "loss": 4.7942, "loss/crossentropy": 2.214662790298462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23744845390319824, "step": 7738 }, { "epoch": 0.1548, "grad_norm": 2.359375, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 4.4325, "loss/crossentropy": 1.995256781578064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21037640422582626, "step": 7740 }, { "epoch": 0.15484, "grad_norm": 1.890625, "grad_norm_var": 0.024006144205729166, "learning_rate": 0.0001, "loss": 4.0701, "loss/crossentropy": 2.2877765893936157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23090071976184845, "step": 7742 }, { "epoch": 0.15488, "grad_norm": 2.125, "grad_norm_var": 0.022652180989583333, "learning_rate": 0.0001, "loss": 4.6167, "loss/crossentropy": 2.23935329914093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24617131054401398, "step": 7744 }, { "epoch": 0.15492, "grad_norm": 2.125, "grad_norm_var": 0.023005167643229168, "learning_rate": 0.0001, "loss": 4.4886, "loss/crossentropy": 2.15006422996521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2235095053911209, "step": 7746 }, { "epoch": 0.15496, "grad_norm": 2.015625, "grad_norm_var": 0.024144490559895832, "learning_rate": 0.0001, "loss": 4.1873, "loss/crossentropy": 1.9917905926704407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21606986224651337, "step": 7748 }, { "epoch": 0.155, "grad_norm": 2.203125, "grad_norm_var": 0.025804646809895835, "learning_rate": 0.0001, "loss": 4.6347, "loss/crossentropy": 2.303179979324341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2541813999414444, "step": 7750 }, { "epoch": 0.15504, "grad_norm": 2.109375, "grad_norm_var": 0.022786458333333332, "learning_rate": 0.0001, "loss": 4.3647, "loss/crossentropy": 2.231510281562805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2479737550020218, "step": 7752 }, { "epoch": 0.15508, "grad_norm": 2.25, "grad_norm_var": 0.01778132120768229, "learning_rate": 0.0001, "loss": 4.1714, "loss/crossentropy": 2.0530437231063843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20220057666301727, "step": 7754 }, { "epoch": 0.15512, "grad_norm": 2.171875, "grad_norm_var": 0.015909830729166668, "learning_rate": 0.0001, "loss": 4.1978, "loss/crossentropy": 2.0889222025871277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22389977425336838, "step": 7756 }, { "epoch": 0.15516, "grad_norm": 2.28125, "grad_norm_var": 0.0131256103515625, "learning_rate": 0.0001, "loss": 4.4748, "loss/crossentropy": 2.3807711601257324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2397037297487259, "step": 7758 }, { "epoch": 0.1552, "grad_norm": 2.171875, "grad_norm_var": 0.01297607421875, "learning_rate": 0.0001, "loss": 4.3382, "loss/crossentropy": 1.8144067525863647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19052604585886002, "step": 7760 }, { "epoch": 0.15524, "grad_norm": 2.09375, "grad_norm_var": 0.013240559895833334, "learning_rate": 0.0001, "loss": 4.5358, "loss/crossentropy": 2.295349955558777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2168404459953308, "step": 7762 }, { "epoch": 0.15528, "grad_norm": 11.8125, "grad_norm_var": 5.868936920166016, "learning_rate": 0.0001, "loss": 4.1706, "loss/crossentropy": 1.7281805276870728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230300635099411, "step": 7764 }, { "epoch": 0.15532, "grad_norm": 2.375, "grad_norm_var": 5.861083730061849, "learning_rate": 0.0001, "loss": 4.3867, "loss/crossentropy": 2.2153135538101196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24419061839580536, "step": 7766 }, { "epoch": 0.15536, "grad_norm": 2.03125, "grad_norm_var": 5.890169270833334, "learning_rate": 0.0001, "loss": 4.2047, "loss/crossentropy": 2.0259060859680176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119704708456993, "step": 7768 }, { "epoch": 0.1554, "grad_norm": 2.21875, "grad_norm_var": 5.871726226806641, "learning_rate": 0.0001, "loss": 4.3405, "loss/crossentropy": 2.2399297952651978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23252833634614944, "step": 7770 }, { "epoch": 0.15544, "grad_norm": 2.109375, "grad_norm_var": 5.86380615234375, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 2.0974661111831665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085232511162758, "step": 7772 }, { "epoch": 0.15548, "grad_norm": 2.21875, "grad_norm_var": 5.861717732747396, "learning_rate": 0.0001, "loss": 4.4141, "loss/crossentropy": 2.0121108293533325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24354346096515656, "step": 7774 }, { "epoch": 0.15552, "grad_norm": 2.171875, "grad_norm_var": 5.843431599934896, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.1463273763656616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22797952592372894, "step": 7776 }, { "epoch": 0.15556, "grad_norm": 1.9140625, "grad_norm_var": 5.875705718994141, "learning_rate": 0.0001, "loss": 3.9814, "loss/crossentropy": 1.6362827122211456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18291430547833443, "step": 7778 }, { "epoch": 0.1556, "grad_norm": 2.21875, "grad_norm_var": 0.016283162434895835, "learning_rate": 0.0001, "loss": 4.3534, "loss/crossentropy": 2.4894620180130005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24631594866514206, "step": 7780 }, { "epoch": 0.15564, "grad_norm": 2.15625, "grad_norm_var": 0.0119781494140625, "learning_rate": 0.0001, "loss": 4.5799, "loss/crossentropy": 2.346967577934265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20964853465557098, "step": 7782 }, { "epoch": 0.15568, "grad_norm": 2.03125, "grad_norm_var": 0.010282135009765625, "learning_rate": 0.0001, "loss": 4.5036, "loss/crossentropy": 2.0165189504623413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22410035878419876, "step": 7784 }, { "epoch": 0.15572, "grad_norm": 2.140625, "grad_norm_var": 0.009417470296223958, "learning_rate": 0.0001, "loss": 4.0735, "loss/crossentropy": 1.7486848831176758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20792805403470993, "step": 7786 }, { "epoch": 0.15576, "grad_norm": 2.03125, "grad_norm_var": 0.009905751546223958, "learning_rate": 0.0001, "loss": 4.1321, "loss/crossentropy": 1.9615037441253662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22578728944063187, "step": 7788 }, { "epoch": 0.1558, "grad_norm": 2.09375, "grad_norm_var": 0.009069569905598958, "learning_rate": 0.0001, "loss": 4.326, "loss/crossentropy": 1.8386783003807068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21898606419563293, "step": 7790 }, { "epoch": 0.15584, "grad_norm": 2.15625, "grad_norm_var": 0.011533355712890625, "learning_rate": 0.0001, "loss": 4.6429, "loss/crossentropy": 2.1383039951324463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2322411835193634, "step": 7792 }, { "epoch": 0.15588, "grad_norm": 2.21875, "grad_norm_var": 0.008918253580729167, "learning_rate": 0.0001, "loss": 4.3568, "loss/crossentropy": 2.510488271713257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24672146886587143, "step": 7794 }, { "epoch": 0.15592, "grad_norm": 2.125, "grad_norm_var": 0.009309895833333333, "learning_rate": 0.0001, "loss": 4.4328, "loss/crossentropy": 2.03993421792984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23064473271369934, "step": 7796 }, { "epoch": 0.15596, "grad_norm": 2.09375, "grad_norm_var": 0.007306925455729167, "learning_rate": 0.0001, "loss": 4.4833, "loss/crossentropy": 2.305809736251831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916814893484116, "step": 7798 }, { "epoch": 0.156, "grad_norm": 2.03125, "grad_norm_var": 0.007796223958333333, "learning_rate": 0.0001, "loss": 4.1218, "loss/crossentropy": 1.8330454230308533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20832987129688263, "step": 7800 }, { "epoch": 0.15604, "grad_norm": 2.140625, "grad_norm_var": 0.007420857747395833, "learning_rate": 0.0001, "loss": 4.2579, "loss/crossentropy": 1.9194663166999817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22163349390029907, "step": 7802 }, { "epoch": 0.15608, "grad_norm": 2.078125, "grad_norm_var": 0.006917317708333333, "learning_rate": 0.0001, "loss": 4.2219, "loss/crossentropy": 1.798878252506256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19532181322574615, "step": 7804 }, { "epoch": 0.15612, "grad_norm": 2.140625, "grad_norm_var": 0.0067942301432291664, "learning_rate": 0.0001, "loss": 4.2949, "loss/crossentropy": 1.730432152748108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20092248916625977, "step": 7806 }, { "epoch": 0.15616, "grad_norm": 2.515625, "grad_norm_var": 0.01304931640625, "learning_rate": 0.0001, "loss": 4.3109, "loss/crossentropy": 2.1426968574523926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22185371816158295, "step": 7808 }, { "epoch": 0.1562, "grad_norm": 2.078125, "grad_norm_var": 0.012848917643229167, "learning_rate": 0.0001, "loss": 4.4308, "loss/crossentropy": 1.983969271183014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22203146666288376, "step": 7810 }, { "epoch": 0.15624, "grad_norm": 2.0625, "grad_norm_var": 0.016532389322916667, "learning_rate": 0.0001, "loss": 4.2114, "loss/crossentropy": 2.2948192954063416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23453453928232193, "step": 7812 }, { "epoch": 0.15628, "grad_norm": 2.09375, "grad_norm_var": 0.0175933837890625, "learning_rate": 0.0001, "loss": 4.287, "loss/crossentropy": 1.9190048575401306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22457829862833023, "step": 7814 }, { "epoch": 0.15632, "grad_norm": 2.171875, "grad_norm_var": 0.01529541015625, "learning_rate": 0.0001, "loss": 4.1721, "loss/crossentropy": 2.0268847346305847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21628276258707047, "step": 7816 }, { "epoch": 0.15636, "grad_norm": 2.203125, "grad_norm_var": 0.017411295572916666, "learning_rate": 0.0001, "loss": 4.3553, "loss/crossentropy": 2.0050706267356873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20108875632286072, "step": 7818 }, { "epoch": 0.1564, "grad_norm": 2.453125, "grad_norm_var": 0.024470774332682292, "learning_rate": 0.0001, "loss": 4.2059, "loss/crossentropy": 2.096635937690735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22623063623905182, "step": 7820 }, { "epoch": 0.15644, "grad_norm": 2.21875, "grad_norm_var": 0.024580637613932293, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 1.9188768863677979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21088901162147522, "step": 7822 }, { "epoch": 0.15648, "grad_norm": 2.203125, "grad_norm_var": 0.018304189046223957, "learning_rate": 0.0001, "loss": 4.4062, "loss/crossentropy": 2.3639817237854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2608294039964676, "step": 7824 }, { "epoch": 0.15652, "grad_norm": 2.1875, "grad_norm_var": 0.017765045166015625, "learning_rate": 0.0001, "loss": 4.2123, "loss/crossentropy": 1.9716956615447998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23327408730983734, "step": 7826 }, { "epoch": 0.15656, "grad_norm": 2.265625, "grad_norm_var": 0.08131688435872396, "learning_rate": 0.0001, "loss": 4.1377, "loss/crossentropy": 2.004276990890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22930586338043213, "step": 7828 }, { "epoch": 0.1566, "grad_norm": 2.078125, "grad_norm_var": 0.08247858683268229, "learning_rate": 0.0001, "loss": 4.4703, "loss/crossentropy": 1.7996181845664978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21030305325984955, "step": 7830 }, { "epoch": 0.15664, "grad_norm": 2.34375, "grad_norm_var": 0.09555435180664062, "learning_rate": 0.0001, "loss": 4.7938, "loss/crossentropy": 2.192178189754486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232302725315094, "step": 7832 }, { "epoch": 0.15668, "grad_norm": 2.109375, "grad_norm_var": 0.09538345336914063, "learning_rate": 0.0001, "loss": 4.2381, "loss/crossentropy": 1.7093925476074219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19697313755750656, "step": 7834 }, { "epoch": 0.15672, "grad_norm": 2.234375, "grad_norm_var": 0.09130859375, "learning_rate": 0.0001, "loss": 4.2534, "loss/crossentropy": 1.915247917175293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21003933250904083, "step": 7836 }, { "epoch": 0.15676, "grad_norm": 2.375, "grad_norm_var": 0.091162109375, "learning_rate": 0.0001, "loss": 4.3825, "loss/crossentropy": 2.188641667366028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23260314762592316, "step": 7838 }, { "epoch": 0.1568, "grad_norm": 2.109375, "grad_norm_var": 0.09501953125, "learning_rate": 0.0001, "loss": 4.7329, "loss/crossentropy": 2.3316495418548584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24601806700229645, "step": 7840 }, { "epoch": 0.15684, "grad_norm": 2.28125, "grad_norm_var": 0.0918121337890625, "learning_rate": 0.0001, "loss": 4.2934, "loss/crossentropy": 2.140946924686432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23532958328723907, "step": 7842 }, { "epoch": 0.15688, "grad_norm": 2.21875, "grad_norm_var": 0.031525675455729166, "learning_rate": 0.0001, "loss": 4.2738, "loss/crossentropy": 2.372095465660095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2615740895271301, "step": 7844 }, { "epoch": 0.15692, "grad_norm": 2.25, "grad_norm_var": 0.028727213541666668, "learning_rate": 0.0001, "loss": 4.1954, "loss/crossentropy": 1.8433185815811157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160269021987915, "step": 7846 }, { "epoch": 0.15696, "grad_norm": 2.140625, "grad_norm_var": 0.016071573893229166, "learning_rate": 0.0001, "loss": 4.2489, "loss/crossentropy": 2.012324333190918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1964300200343132, "step": 7848 }, { "epoch": 0.157, "grad_norm": 2.203125, "grad_norm_var": 0.014090983072916667, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 2.0325884222984314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22080926597118378, "step": 7850 }, { "epoch": 0.15704, "grad_norm": 2.296875, "grad_norm_var": 0.013036092122395834, "learning_rate": 0.0001, "loss": 4.3784, "loss/crossentropy": 2.3786104917526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24448946118354797, "step": 7852 }, { "epoch": 0.15708, "grad_norm": 2.21875, "grad_norm_var": 0.011693318684895834, "learning_rate": 0.0001, "loss": 4.5173, "loss/crossentropy": 2.304913640022278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24708375334739685, "step": 7854 }, { "epoch": 0.15712, "grad_norm": 2.1875, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 4.5711, "loss/crossentropy": 1.8640215396881104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19824489206075668, "step": 7856 }, { "epoch": 0.15716, "grad_norm": 2.0625, "grad_norm_var": 0.00625, "learning_rate": 0.0001, "loss": 4.6174, "loss/crossentropy": 2.444548487663269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2542492523789406, "step": 7858 }, { "epoch": 0.1572, "grad_norm": 2.078125, "grad_norm_var": 0.00592041015625, "learning_rate": 0.0001, "loss": 4.0522, "loss/crossentropy": 1.991346299648285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159758359193802, "step": 7860 }, { "epoch": 0.15724, "grad_norm": 2.15625, "grad_norm_var": 0.005101521809895833, "learning_rate": 0.0001, "loss": 4.2627, "loss/crossentropy": 2.3172048926353455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2506742626428604, "step": 7862 }, { "epoch": 0.15728, "grad_norm": 2.265625, "grad_norm_var": 0.0050201416015625, "learning_rate": 0.0001, "loss": 4.5415, "loss/crossentropy": 1.9302632212638855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104542776942253, "step": 7864 }, { "epoch": 0.15732, "grad_norm": 2.125, "grad_norm_var": 0.020726521809895832, "learning_rate": 0.0001, "loss": 4.7831, "loss/crossentropy": 2.457883358001709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23538483679294586, "step": 7866 }, { "epoch": 0.15736, "grad_norm": 2.125, "grad_norm_var": 0.019677734375, "learning_rate": 0.0001, "loss": 4.1912, "loss/crossentropy": 1.9857566952705383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2131117358803749, "step": 7868 }, { "epoch": 0.1574, "grad_norm": 2.203125, "grad_norm_var": 0.019140625, "learning_rate": 0.0001, "loss": 4.4264, "loss/crossentropy": 2.0062127113342285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22806233912706375, "step": 7870 }, { "epoch": 0.15744, "grad_norm": 2.15625, "grad_norm_var": 0.019489542643229166, "learning_rate": 0.0001, "loss": 4.6181, "loss/crossentropy": 2.283127784729004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23883548378944397, "step": 7872 }, { "epoch": 0.15748, "grad_norm": 2.234375, "grad_norm_var": 0.019136555989583335, "learning_rate": 0.0001, "loss": 4.4442, "loss/crossentropy": 2.110979437828064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209333136677742, "step": 7874 }, { "epoch": 0.15752, "grad_norm": 2.140625, "grad_norm_var": 0.019856770833333332, "learning_rate": 0.0001, "loss": 4.2107, "loss/crossentropy": 1.8705166578292847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20791998505592346, "step": 7876 }, { "epoch": 0.15756, "grad_norm": 2.140625, "grad_norm_var": 0.019017537434895832, "learning_rate": 0.0001, "loss": 4.2956, "loss/crossentropy": 2.133803129196167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23219672590494156, "step": 7878 }, { "epoch": 0.1576, "grad_norm": 2.21875, "grad_norm_var": 0.019627888997395832, "learning_rate": 0.0001, "loss": 4.5028, "loss/crossentropy": 2.1062549352645874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2266012579202652, "step": 7880 }, { "epoch": 0.15764, "grad_norm": 2.28125, "grad_norm_var": 0.0058553059895833336, "learning_rate": 0.0001, "loss": 4.5078, "loss/crossentropy": 2.0088155269622803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079932913184166, "step": 7882 }, { "epoch": 0.15768, "grad_norm": 2.234375, "grad_norm_var": 0.0065582275390625, "learning_rate": 0.0001, "loss": 3.9204, "loss/crossentropy": 1.6621176600456238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19370710104703903, "step": 7884 }, { "epoch": 0.15772, "grad_norm": 2.09375, "grad_norm_var": 0.007136027018229167, "learning_rate": 0.0001, "loss": 4.3462, "loss/crossentropy": 1.7729167938232422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18562481552362442, "step": 7886 }, { "epoch": 0.15776, "grad_norm": 2.109375, "grad_norm_var": 0.006843058268229166, "learning_rate": 0.0001, "loss": 4.3897, "loss/crossentropy": 2.132485508918762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23432063311338425, "step": 7888 }, { "epoch": 0.1578, "grad_norm": 2.328125, "grad_norm_var": 0.0091949462890625, "learning_rate": 0.0001, "loss": 4.3731, "loss/crossentropy": 2.122144937515259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23042195290327072, "step": 7890 }, { "epoch": 0.15784, "grad_norm": 2.8125, "grad_norm_var": 0.034956868489583334, "learning_rate": 0.0001, "loss": 4.4025, "loss/crossentropy": 1.855578601360321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20124691724777222, "step": 7892 }, { "epoch": 0.15788, "grad_norm": 2.15625, "grad_norm_var": 0.035054524739583336, "learning_rate": 0.0001, "loss": 4.172, "loss/crossentropy": 2.02128005027771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20766886323690414, "step": 7894 }, { "epoch": 0.15792, "grad_norm": 1.984375, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 4.3096, "loss/crossentropy": 2.112824857234955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22250613570213318, "step": 7896 }, { "epoch": 0.15796, "grad_norm": 2.3125, "grad_norm_var": 0.03662821451822917, "learning_rate": 0.0001, "loss": 4.5595, "loss/crossentropy": 2.2290207147598267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2531846910715103, "step": 7898 }, { "epoch": 0.158, "grad_norm": 1.96875, "grad_norm_var": 0.03882548014322917, "learning_rate": 0.0001, "loss": 4.3586, "loss/crossentropy": 2.135373592376709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2266940325498581, "step": 7900 }, { "epoch": 0.15804, "grad_norm": 2.125, "grad_norm_var": 0.04052734375, "learning_rate": 0.0001, "loss": 4.1719, "loss/crossentropy": 1.7298616170883179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986929401755333, "step": 7902 }, { "epoch": 0.15808, "grad_norm": 2.3125, "grad_norm_var": 0.04173075358072917, "learning_rate": 0.0001, "loss": 4.403, "loss/crossentropy": 2.18759286403656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21622422337532043, "step": 7904 }, { "epoch": 0.15812, "grad_norm": 2.140625, "grad_norm_var": 0.03889058430989583, "learning_rate": 0.0001, "loss": 4.4765, "loss/crossentropy": 2.0889216661453247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21440055221319199, "step": 7906 }, { "epoch": 0.15816, "grad_norm": 2.1875, "grad_norm_var": 0.010204060872395834, "learning_rate": 0.0001, "loss": 4.2231, "loss/crossentropy": 1.7791658639907837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1967781037092209, "step": 7908 }, { "epoch": 0.1582, "grad_norm": 1.953125, "grad_norm_var": 0.012386067708333334, "learning_rate": 0.0001, "loss": 4.2314, "loss/crossentropy": 2.2144845724105835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22152486443519592, "step": 7910 }, { "epoch": 0.15824, "grad_norm": 2.078125, "grad_norm_var": 0.012723795572916667, "learning_rate": 0.0001, "loss": 4.6236, "loss/crossentropy": 2.316117286682129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24549901485443115, "step": 7912 }, { "epoch": 0.15828, "grad_norm": 2.046875, "grad_norm_var": 0.0115875244140625, "learning_rate": 0.0001, "loss": 4.0867, "loss/crossentropy": 1.8642511367797852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22441796958446503, "step": 7914 }, { "epoch": 0.15832, "grad_norm": 2.078125, "grad_norm_var": 0.0098541259765625, "learning_rate": 0.0001, "loss": 4.1365, "loss/crossentropy": 1.807969868183136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22138798981904984, "step": 7916 }, { "epoch": 0.15836, "grad_norm": 2.390625, "grad_norm_var": 0.013866170247395834, "learning_rate": 0.0001, "loss": 4.5625, "loss/crossentropy": 2.266697645187378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22544697672128677, "step": 7918 }, { "epoch": 0.1584, "grad_norm": 2.203125, "grad_norm_var": 0.012262980143229166, "learning_rate": 0.0001, "loss": 4.5632, "loss/crossentropy": 2.0871587991714478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21997228264808655, "step": 7920 }, { "epoch": 0.15844, "grad_norm": 2.21875, "grad_norm_var": 0.0126861572265625, "learning_rate": 0.0001, "loss": 4.5567, "loss/crossentropy": 2.1993759870529175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2416691780090332, "step": 7922 }, { "epoch": 0.15848, "grad_norm": 2.5, "grad_norm_var": 0.020340983072916666, "learning_rate": 0.0001, "loss": 4.2997, "loss/crossentropy": 1.8039653897285461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21721573173999786, "step": 7924 }, { "epoch": 0.15852, "grad_norm": 2.3125, "grad_norm_var": 0.0225982666015625, "learning_rate": 0.0001, "loss": 4.4795, "loss/crossentropy": 2.188117265701294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231883242726326, "step": 7926 }, { "epoch": 0.15856, "grad_norm": 2.171875, "grad_norm_var": 0.021923828125, "learning_rate": 0.0001, "loss": 4.6694, "loss/crossentropy": 2.3920425176620483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24655026197433472, "step": 7928 }, { "epoch": 0.1586, "grad_norm": 2.125, "grad_norm_var": 0.017561848958333334, "learning_rate": 0.0001, "loss": 4.1001, "loss/crossentropy": 2.286831498146057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23398208618164062, "step": 7930 }, { "epoch": 0.15864, "grad_norm": 2.0625, "grad_norm_var": 0.016825358072916668, "learning_rate": 0.0001, "loss": 4.0007, "loss/crossentropy": 2.075824797153473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22423933446407318, "step": 7932 }, { "epoch": 0.15868, "grad_norm": 2.25, "grad_norm_var": 0.0155426025390625, "learning_rate": 0.0001, "loss": 4.5697, "loss/crossentropy": 2.197165012359619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22215355187654495, "step": 7934 }, { "epoch": 0.15872, "grad_norm": 2.078125, "grad_norm_var": 0.017854817708333335, "learning_rate": 0.0001, "loss": 4.2899, "loss/crossentropy": 1.8253535032272339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20457974076271057, "step": 7936 }, { "epoch": 0.15876, "grad_norm": 2.21875, "grad_norm_var": 0.01783447265625, "learning_rate": 0.0001, "loss": 4.4774, "loss/crossentropy": 1.842383086681366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20742817968130112, "step": 7938 }, { "epoch": 0.1588, "grad_norm": 2.0625, "grad_norm_var": 0.014208984375, "learning_rate": 0.0001, "loss": 4.3801, "loss/crossentropy": 2.2086315155029297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086438685655594, "step": 7940 }, { "epoch": 0.15884, "grad_norm": 2.109375, "grad_norm_var": 0.0067047119140625, "learning_rate": 0.0001, "loss": 4.3718, "loss/crossentropy": 2.3381282091140747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2546956539154053, "step": 7942 }, { "epoch": 0.15888, "grad_norm": 2.265625, "grad_norm_var": 0.005248006184895833, "learning_rate": 0.0001, "loss": 4.3439, "loss/crossentropy": 2.1159931421279907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23917824029922485, "step": 7944 }, { "epoch": 0.15892, "grad_norm": 2.078125, "grad_norm_var": 0.0069081624348958336, "learning_rate": 0.0001, "loss": 4.2056, "loss/crossentropy": 1.7507159113883972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1870383694767952, "step": 7946 }, { "epoch": 0.15896, "grad_norm": 2.140625, "grad_norm_var": 0.0073964436848958336, "learning_rate": 0.0001, "loss": 4.2118, "loss/crossentropy": 2.213807225227356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22211921215057373, "step": 7948 }, { "epoch": 0.159, "grad_norm": 2.03125, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 4.4316, "loss/crossentropy": 2.0570366978645325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2662791311740875, "step": 7950 }, { "epoch": 0.15904, "grad_norm": 2.171875, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 4.2041, "loss/crossentropy": 1.8775206208229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22045740485191345, "step": 7952 }, { "epoch": 0.15908, "grad_norm": 2.046875, "grad_norm_var": 0.009601847330729166, "learning_rate": 0.0001, "loss": 4.2575, "loss/crossentropy": 2.3483108282089233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22626785188913345, "step": 7954 }, { "epoch": 0.15912, "grad_norm": 2.125, "grad_norm_var": 0.009496053059895834, "learning_rate": 0.0001, "loss": 4.193, "loss/crossentropy": 1.9531084895133972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22194421291351318, "step": 7956 }, { "epoch": 0.15916, "grad_norm": 2.328125, "grad_norm_var": 0.011213175455729167, "learning_rate": 0.0001, "loss": 4.4819, "loss/crossentropy": 2.057813823223114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21116723865270615, "step": 7958 }, { "epoch": 0.1592, "grad_norm": 2.28125, "grad_norm_var": 0.01148681640625, "learning_rate": 0.0001, "loss": 4.3478, "loss/crossentropy": 1.9398415088653564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2257898524403572, "step": 7960 }, { "epoch": 0.15924, "grad_norm": 2.015625, "grad_norm_var": 0.0111480712890625, "learning_rate": 0.0001, "loss": 4.1942, "loss/crossentropy": 1.9200270175933838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119666188955307, "step": 7962 }, { "epoch": 0.15928, "grad_norm": 2.203125, "grad_norm_var": 0.010640462239583334, "learning_rate": 0.0001, "loss": 4.4513, "loss/crossentropy": 2.157149076461792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23271384835243225, "step": 7964 }, { "epoch": 0.15932, "grad_norm": 2.25, "grad_norm_var": 0.0083404541015625, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 2.0308582186698914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23934553563594818, "step": 7966 }, { "epoch": 0.15936, "grad_norm": 2.234375, "grad_norm_var": 0.015543619791666666, "learning_rate": 0.0001, "loss": 4.3977, "loss/crossentropy": 2.1855711936950684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.236568883061409, "step": 7968 }, { "epoch": 0.1594, "grad_norm": 2.375, "grad_norm_var": 0.07629292805989583, "learning_rate": 0.0001, "loss": 4.719, "loss/crossentropy": 2.4479551315307617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24693088978528976, "step": 7970 }, { "epoch": 0.15944, "grad_norm": 2.21875, "grad_norm_var": 0.07333882649739583, "learning_rate": 0.0001, "loss": 4.5221, "loss/crossentropy": 2.439974784851074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502682954072952, "step": 7972 }, { "epoch": 0.15948, "grad_norm": 2.375, "grad_norm_var": 0.07625325520833333, "learning_rate": 0.0001, "loss": 4.4024, "loss/crossentropy": 1.9899010062217712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23031124472618103, "step": 7974 }, { "epoch": 0.15952, "grad_norm": 2.109375, "grad_norm_var": 0.07517903645833333, "learning_rate": 0.0001, "loss": 4.2522, "loss/crossentropy": 1.830255150794983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984459012746811, "step": 7976 }, { "epoch": 0.15956, "grad_norm": 2.21875, "grad_norm_var": 0.07088114420572916, "learning_rate": 0.0001, "loss": 4.3832, "loss/crossentropy": 1.9675705432891846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21336429566144943, "step": 7978 }, { "epoch": 0.1596, "grad_norm": 2.1875, "grad_norm_var": 0.06965738932291667, "learning_rate": 0.0001, "loss": 4.4784, "loss/crossentropy": 2.030815005302429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102503478527069, "step": 7980 }, { "epoch": 0.15964, "grad_norm": 2.109375, "grad_norm_var": 0.07285054524739583, "learning_rate": 0.0001, "loss": 4.492, "loss/crossentropy": 2.4932440519332886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246867336332798, "step": 7982 }, { "epoch": 0.15968, "grad_norm": 2.078125, "grad_norm_var": 0.07330322265625, "learning_rate": 0.0001, "loss": 4.2085, "loss/crossentropy": 1.5839802622795105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17231802642345428, "step": 7984 }, { "epoch": 0.15972, "grad_norm": 2.359375, "grad_norm_var": 0.0091217041015625, "learning_rate": 0.0001, "loss": 4.4045, "loss/crossentropy": 1.821477472782135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239791452884674, "step": 7986 }, { "epoch": 0.15976, "grad_norm": 2.109375, "grad_norm_var": 0.0093414306640625, "learning_rate": 0.0001, "loss": 4.0655, "loss/crossentropy": 2.013838052749634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335103377699852, "step": 7988 }, { "epoch": 0.1598, "grad_norm": 2.03125, "grad_norm_var": 0.006376139322916667, "learning_rate": 0.0001, "loss": 4.4989, "loss/crossentropy": 1.9412779211997986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20336110144853592, "step": 7990 }, { "epoch": 0.15984, "grad_norm": 2.0, "grad_norm_var": 0.008234659830729166, "learning_rate": 0.0001, "loss": 3.9015, "loss/crossentropy": 1.6653677225112915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025909200310707, "step": 7992 }, { "epoch": 0.15988, "grad_norm": 2.125, "grad_norm_var": 0.00865478515625, "learning_rate": 0.0001, "loss": 4.4299, "loss/crossentropy": 2.0069726705551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21300900727510452, "step": 7994 }, { "epoch": 0.15992, "grad_norm": 2.046875, "grad_norm_var": 0.009919230143229167, "learning_rate": 0.0001, "loss": 4.4395, "loss/crossentropy": 2.1118472814559937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25853876769542694, "step": 7996 }, { "epoch": 0.15996, "grad_norm": 2.34375, "grad_norm_var": 0.013622029622395834, "learning_rate": 0.0001, "loss": 4.1922, "loss/crossentropy": 1.608262836933136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958206593990326, "step": 7998 }, { "epoch": 0.16, "grad_norm": 2.203125, "grad_norm_var": 0.0138580322265625, "learning_rate": 0.0001, "loss": 4.2896, "loss/crossentropy": 1.6572073101997375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19102878868579865, "step": 8000 }, { "epoch": 0.16004, "grad_norm": 2.015625, "grad_norm_var": 0.0113677978515625, "learning_rate": 0.0001, "loss": 4.0811, "loss/crossentropy": 1.9421688318252563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19628942012786865, "step": 8002 }, { "epoch": 0.16008, "grad_norm": 2.265625, "grad_norm_var": 0.0134918212890625, "learning_rate": 0.0001, "loss": 4.5344, "loss/crossentropy": 2.2197489738464355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.248014435172081, "step": 8004 }, { "epoch": 0.16012, "grad_norm": 2.203125, "grad_norm_var": 0.0126617431640625, "learning_rate": 0.0001, "loss": 4.4625, "loss/crossentropy": 2.200868308544159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2476908192038536, "step": 8006 }, { "epoch": 0.16016, "grad_norm": 2.34375, "grad_norm_var": 0.01162109375, "learning_rate": 0.0001, "loss": 4.8151, "loss/crossentropy": 2.3793649673461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356039509177208, "step": 8008 }, { "epoch": 0.1602, "grad_norm": 2.140625, "grad_norm_var": 0.0112945556640625, "learning_rate": 0.0001, "loss": 4.1849, "loss/crossentropy": 2.130257308483124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224358968436718, "step": 8010 }, { "epoch": 0.16024, "grad_norm": 2.15625, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 4.5948, "loss/crossentropy": 2.370365023612976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23811528086662292, "step": 8012 }, { "epoch": 0.16028, "grad_norm": 2.0, "grad_norm_var": 0.008812459309895833, "learning_rate": 0.0001, "loss": 4.4326, "loss/crossentropy": 1.985486626625061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19639353454113007, "step": 8014 }, { "epoch": 0.16032, "grad_norm": 2.40625, "grad_norm_var": 0.01129150390625, "learning_rate": 0.0001, "loss": 4.4335, "loss/crossentropy": 2.2128632068634033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2340361252427101, "step": 8016 }, { "epoch": 0.16036, "grad_norm": 2.0625, "grad_norm_var": 0.009956868489583333, "learning_rate": 0.0001, "loss": 4.3688, "loss/crossentropy": 1.830498456954956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19760622829198837, "step": 8018 }, { "epoch": 0.1604, "grad_norm": 4.375, "grad_norm_var": 0.3061757405598958, "learning_rate": 0.0001, "loss": 4.6443, "loss/crossentropy": 1.9595977067947388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138657420873642, "step": 8020 }, { "epoch": 0.16044, "grad_norm": 2.1875, "grad_norm_var": 0.30684305826822916, "learning_rate": 0.0001, "loss": 4.3101, "loss/crossentropy": 2.400893449783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2615668326616287, "step": 8022 }, { "epoch": 0.16048, "grad_norm": 2.140625, "grad_norm_var": 0.3080963134765625, "learning_rate": 0.0001, "loss": 4.6359, "loss/crossentropy": 2.5079843997955322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.251235693693161, "step": 8024 }, { "epoch": 0.16052, "grad_norm": 2.265625, "grad_norm_var": 0.30686848958333335, "learning_rate": 0.0001, "loss": 4.4973, "loss/crossentropy": 2.573891043663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2646481841802597, "step": 8026 }, { "epoch": 0.16056, "grad_norm": 2.1875, "grad_norm_var": 0.3064605712890625, "learning_rate": 0.0001, "loss": 4.4177, "loss/crossentropy": 2.0088363885879517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22888045758008957, "step": 8028 }, { "epoch": 0.1606, "grad_norm": 2.109375, "grad_norm_var": 0.31115697224934896, "learning_rate": 0.0001, "loss": 4.214, "loss/crossentropy": 2.4596647024154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2507014721632004, "step": 8030 }, { "epoch": 0.16064, "grad_norm": 2.1875, "grad_norm_var": 0.3093462626139323, "learning_rate": 0.0001, "loss": 4.6826, "loss/crossentropy": 2.204255223274231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22913406044244766, "step": 8032 }, { "epoch": 0.16068, "grad_norm": 2.125, "grad_norm_var": 0.30677261352539065, "learning_rate": 0.0001, "loss": 4.6603, "loss/crossentropy": 2.285408139228821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23185817897319794, "step": 8034 }, { "epoch": 0.16072, "grad_norm": 1.9609375, "grad_norm_var": 0.012520345052083333, "learning_rate": 0.0001, "loss": 3.9513, "loss/crossentropy": 2.0788660645484924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20811481028795242, "step": 8036 }, { "epoch": 0.16076, "grad_norm": 2.140625, "grad_norm_var": 0.01307373046875, "learning_rate": 0.0001, "loss": 4.5368, "loss/crossentropy": 2.398258686065674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2411409318447113, "step": 8038 }, { "epoch": 0.1608, "grad_norm": 2.140625, "grad_norm_var": 0.015579986572265624, "learning_rate": 0.0001, "loss": 4.0415, "loss/crossentropy": 2.3101454973220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22493668645620346, "step": 8040 }, { "epoch": 0.16084, "grad_norm": 2.203125, "grad_norm_var": 0.015134429931640625, "learning_rate": 0.0001, "loss": 4.0612, "loss/crossentropy": 1.9015105962753296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22360052913427353, "step": 8042 }, { "epoch": 0.16088, "grad_norm": 2.34375, "grad_norm_var": 0.017704010009765625, "learning_rate": 0.0001, "loss": 4.216, "loss/crossentropy": 2.0112481117248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24062782526016235, "step": 8044 }, { "epoch": 0.16092, "grad_norm": 2.171875, "grad_norm_var": 0.0161773681640625, "learning_rate": 0.0001, "loss": 4.3638, "loss/crossentropy": 2.2024285793304443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959003806114197, "step": 8046 }, { "epoch": 0.16096, "grad_norm": 2.125, "grad_norm_var": 0.014412434895833333, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 2.1069058775901794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21980682760477066, "step": 8048 }, { "epoch": 0.161, "grad_norm": 2.3125, "grad_norm_var": 0.012540690104166667, "learning_rate": 0.0001, "loss": 4.2106, "loss/crossentropy": 1.8380340337753296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2201671525835991, "step": 8050 }, { "epoch": 0.16104, "grad_norm": 2.046875, "grad_norm_var": 0.010965728759765625, "learning_rate": 0.0001, "loss": 4.3778, "loss/crossentropy": 2.276741087436676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23787930607795715, "step": 8052 }, { "epoch": 0.16108, "grad_norm": 2.140625, "grad_norm_var": 0.010680898030598959, "learning_rate": 0.0001, "loss": 4.0784, "loss/crossentropy": 1.631809651851654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19654212146997452, "step": 8054 }, { "epoch": 0.16112, "grad_norm": 2.234375, "grad_norm_var": 0.007453409830729166, "learning_rate": 0.0001, "loss": 4.5927, "loss/crossentropy": 2.067444145679474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21420229971408844, "step": 8056 }, { "epoch": 0.16116, "grad_norm": 2.21875, "grad_norm_var": 0.011767578125, "learning_rate": 0.0001, "loss": 4.562, "loss/crossentropy": 2.384890556335449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312106341123581, "step": 8058 }, { "epoch": 0.1612, "grad_norm": 2.265625, "grad_norm_var": 0.010986328125, "learning_rate": 0.0001, "loss": 4.363, "loss/crossentropy": 2.3683160543441772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2527216002345085, "step": 8060 }, { "epoch": 0.16124, "grad_norm": 2.171875, "grad_norm_var": 0.011253865559895833, "learning_rate": 0.0001, "loss": 4.4576, "loss/crossentropy": 2.1545952558517456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23928315192461014, "step": 8062 }, { "epoch": 0.16128, "grad_norm": 1.96875, "grad_norm_var": 0.013695271809895833, "learning_rate": 0.0001, "loss": 4.2285, "loss/crossentropy": 2.0792208313941956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2226722463965416, "step": 8064 }, { "epoch": 0.16132, "grad_norm": 2.09375, "grad_norm_var": 0.013068644205729167, "learning_rate": 0.0001, "loss": 3.8788, "loss/crossentropy": 2.181519627571106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193506434559822, "step": 8066 }, { "epoch": 0.16136, "grad_norm": 2.375, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 4.4445, "loss/crossentropy": 1.7798657417297363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23413337767124176, "step": 8068 }, { "epoch": 0.1614, "grad_norm": 2.125, "grad_norm_var": 0.017235310872395833, "learning_rate": 0.0001, "loss": 4.0731, "loss/crossentropy": 2.1232666969299316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23351240158081055, "step": 8070 }, { "epoch": 0.16144, "grad_norm": 2.203125, "grad_norm_var": 0.017609659830729166, "learning_rate": 0.0001, "loss": 4.6975, "loss/crossentropy": 2.34002685546875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2533388137817383, "step": 8072 }, { "epoch": 0.16148, "grad_norm": 2.21875, "grad_norm_var": 0.011637369791666666, "learning_rate": 0.0001, "loss": 4.3487, "loss/crossentropy": 2.066399872303009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21156150847673416, "step": 8074 }, { "epoch": 0.16152, "grad_norm": 2.171875, "grad_norm_var": 0.010677083333333334, "learning_rate": 0.0001, "loss": 4.3646, "loss/crossentropy": 2.2298463582992554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2307809516787529, "step": 8076 }, { "epoch": 0.16156, "grad_norm": 2.0625, "grad_norm_var": 0.0105865478515625, "learning_rate": 0.0001, "loss": 4.1946, "loss/crossentropy": 1.9858508110046387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21499747782945633, "step": 8078 }, { "epoch": 0.1616, "grad_norm": 2.0625, "grad_norm_var": 0.011149088541666666, "learning_rate": 0.0001, "loss": 3.9984, "loss/crossentropy": 1.5669215321540833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889330968260765, "step": 8080 }, { "epoch": 0.16164, "grad_norm": 2.25, "grad_norm_var": 0.011881510416666666, "learning_rate": 0.0001, "loss": 4.4743, "loss/crossentropy": 2.296878218650818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2405889928340912, "step": 8082 }, { "epoch": 0.16168, "grad_norm": 2.203125, "grad_norm_var": 0.007307942708333333, "learning_rate": 0.0001, "loss": 4.2205, "loss/crossentropy": 1.954626441001892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19809379428625107, "step": 8084 }, { "epoch": 0.16172, "grad_norm": 2.3125, "grad_norm_var": 0.008687337239583334, "learning_rate": 0.0001, "loss": 4.5923, "loss/crossentropy": 1.8640353083610535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.226090669631958, "step": 8086 }, { "epoch": 0.16176, "grad_norm": 2.28125, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 4.3363, "loss/crossentropy": 2.1914591789245605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24432511627674103, "step": 8088 }, { "epoch": 0.1618, "grad_norm": 2.21875, "grad_norm_var": 0.009798177083333333, "learning_rate": 0.0001, "loss": 4.7291, "loss/crossentropy": 2.192594051361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23682628571987152, "step": 8090 }, { "epoch": 0.16184, "grad_norm": 2.09375, "grad_norm_var": 0.010993448893229167, "learning_rate": 0.0001, "loss": 4.4592, "loss/crossentropy": 2.210235595703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230379119515419, "step": 8092 }, { "epoch": 0.16188, "grad_norm": 2.140625, "grad_norm_var": 0.011115519205729167, "learning_rate": 0.0001, "loss": 4.4111, "loss/crossentropy": 2.214667320251465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26643867790699005, "step": 8094 }, { "epoch": 0.16192, "grad_norm": 2.09375, "grad_norm_var": 0.009586588541666666, "learning_rate": 0.0001, "loss": 4.3131, "loss/crossentropy": 1.9808599948883057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20679324120283127, "step": 8096 }, { "epoch": 0.16196, "grad_norm": 2.109375, "grad_norm_var": 0.008333333333333333, "learning_rate": 0.0001, "loss": 4.4683, "loss/crossentropy": 2.076589345932007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23430196940898895, "step": 8098 }, { "epoch": 0.162, "grad_norm": 2.15625, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 4.5645, "loss/crossentropy": 2.364492177963257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23023054748773575, "step": 8100 }, { "epoch": 0.16204, "grad_norm": 2.140625, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 4.6595, "loss/crossentropy": 2.2908111214637756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21239468455314636, "step": 8102 }, { "epoch": 0.16208, "grad_norm": 2.40625, "grad_norm_var": 0.008958943684895833, "learning_rate": 0.0001, "loss": 4.5503, "loss/crossentropy": 1.8306183218955994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21511316299438477, "step": 8104 }, { "epoch": 0.16212, "grad_norm": 2.0625, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.1582, "loss/crossentropy": 2.1121758222579956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23471853882074356, "step": 8106 }, { "epoch": 0.16216, "grad_norm": 2.140625, "grad_norm_var": 0.012355295817057292, "learning_rate": 0.0001, "loss": 4.1464, "loss/crossentropy": 1.7613067030906677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025228664278984, "step": 8108 }, { "epoch": 0.1622, "grad_norm": 2.0, "grad_norm_var": 0.013844553629557292, "learning_rate": 0.0001, "loss": 4.0797, "loss/crossentropy": 2.1413058042526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21817607432603836, "step": 8110 }, { "epoch": 0.16224, "grad_norm": 2.265625, "grad_norm_var": 0.013641103108723959, "learning_rate": 0.0001, "loss": 4.5739, "loss/crossentropy": 2.375948429107666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30982857942581177, "step": 8112 }, { "epoch": 0.16228, "grad_norm": 2.1875, "grad_norm_var": 0.013396962483723959, "learning_rate": 0.0001, "loss": 4.076, "loss/crossentropy": 1.9669193029403687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21873307973146439, "step": 8114 }, { "epoch": 0.16232, "grad_norm": 2.21875, "grad_norm_var": 0.015592193603515625, "learning_rate": 0.0001, "loss": 4.4024, "loss/crossentropy": 2.2028547525405884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229970782995224, "step": 8116 }, { "epoch": 0.16236, "grad_norm": 2.078125, "grad_norm_var": 0.01622289021809896, "learning_rate": 0.0001, "loss": 4.3359, "loss/crossentropy": 2.082156002521515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262335941195488, "step": 8118 }, { "epoch": 0.1624, "grad_norm": 2.078125, "grad_norm_var": 0.012379709879557292, "learning_rate": 0.0001, "loss": 4.2737, "loss/crossentropy": 2.0538666248321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22941745817661285, "step": 8120 }, { "epoch": 0.16244, "grad_norm": 2.4375, "grad_norm_var": 0.015750885009765625, "learning_rate": 0.0001, "loss": 4.3615, "loss/crossentropy": 2.338989734649658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2434876710176468, "step": 8122 }, { "epoch": 0.16248, "grad_norm": 2.375, "grad_norm_var": 2.602311197916667, "learning_rate": 0.0001, "loss": 4.5472, "loss/crossentropy": 2.277916193008423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29285988211631775, "step": 8124 }, { "epoch": 0.16252, "grad_norm": 2.46875, "grad_norm_var": 2.567577107747396, "learning_rate": 0.0001, "loss": 4.3437, "loss/crossentropy": 2.1196334958076477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728030800819397, "step": 8126 }, { "epoch": 0.16256, "grad_norm": 2.15625, "grad_norm_var": 2.5655558268229166, "learning_rate": 0.0001, "loss": 4.4216, "loss/crossentropy": 2.0216450095176697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229092076420784, "step": 8128 }, { "epoch": 0.1626, "grad_norm": 2.171875, "grad_norm_var": 2.5546834309895834, "learning_rate": 0.0001, "loss": 4.5163, "loss/crossentropy": 2.098921537399292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22539281845092773, "step": 8130 }, { "epoch": 0.16264, "grad_norm": 2.03125, "grad_norm_var": 2.580052693684896, "learning_rate": 0.0001, "loss": 4.1066, "loss/crossentropy": 2.0186068415641785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20286529511213303, "step": 8132 }, { "epoch": 0.16268, "grad_norm": 2.28125, "grad_norm_var": 2.5584706624348956, "learning_rate": 0.0001, "loss": 4.7847, "loss/crossentropy": 2.157357335090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23342902958393097, "step": 8134 }, { "epoch": 0.16272, "grad_norm": 2.140625, "grad_norm_var": 2.5516998291015627, "learning_rate": 0.0001, "loss": 4.3489, "loss/crossentropy": 2.1885476112365723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22314801812171936, "step": 8136 }, { "epoch": 0.16276, "grad_norm": 2.21875, "grad_norm_var": 2.5796160380045574, "learning_rate": 0.0001, "loss": 4.3403, "loss/crossentropy": 2.2208757400512695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295266091823578, "step": 8138 }, { "epoch": 0.1628, "grad_norm": 2.015625, "grad_norm_var": 0.027522532145182292, "learning_rate": 0.0001, "loss": 4.0473, "loss/crossentropy": 1.7885233163833618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008970081806183, "step": 8140 }, { "epoch": 0.16284, "grad_norm": 2.078125, "grad_norm_var": 0.022989654541015626, "learning_rate": 0.0001, "loss": 4.0835, "loss/crossentropy": 2.152435064315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2311881259083748, "step": 8142 }, { "epoch": 0.16288, "grad_norm": 2.296875, "grad_norm_var": 0.016001129150390626, "learning_rate": 0.0001, "loss": 4.3384, "loss/crossentropy": 2.0818406343460083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24117180705070496, "step": 8144 }, { "epoch": 0.16292, "grad_norm": 2.140625, "grad_norm_var": 0.02513402303059896, "learning_rate": 0.0001, "loss": 4.4246, "loss/crossentropy": 2.1775856614112854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23246955126523972, "step": 8146 }, { "epoch": 0.16296, "grad_norm": 2.578125, "grad_norm_var": 0.033607737223307295, "learning_rate": 0.0001, "loss": 3.9993, "loss/crossentropy": 1.7029682397842407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19564303010702133, "step": 8148 }, { "epoch": 0.163, "grad_norm": 2.453125, "grad_norm_var": 0.03468195597330729, "learning_rate": 0.0001, "loss": 4.5631, "loss/crossentropy": 2.070194900035858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2382609099149704, "step": 8150 }, { "epoch": 0.16304, "grad_norm": 2.046875, "grad_norm_var": 0.036043039957682294, "learning_rate": 0.0001, "loss": 4.19, "loss/crossentropy": 1.9214876890182495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20894134789705276, "step": 8152 }, { "epoch": 0.16308, "grad_norm": 2.21875, "grad_norm_var": 0.03243815104166667, "learning_rate": 0.0001, "loss": 4.3622, "loss/crossentropy": 1.7311474084854126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962232068181038, "step": 8154 }, { "epoch": 0.16312, "grad_norm": 2.0, "grad_norm_var": 0.0323150634765625, "learning_rate": 0.0001, "loss": 4.4623, "loss/crossentropy": 2.2161877155303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22318138182163239, "step": 8156 }, { "epoch": 0.16316, "grad_norm": 2.078125, "grad_norm_var": 0.03186747233072917, "learning_rate": 0.0001, "loss": 4.7343, "loss/crossentropy": 2.21865177154541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234280064702034, "step": 8158 }, { "epoch": 0.1632, "grad_norm": 2.09375, "grad_norm_var": 0.032835896809895834, "learning_rate": 0.0001, "loss": 4.3277, "loss/crossentropy": 1.9517142176628113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1964063122868538, "step": 8160 }, { "epoch": 0.16324, "grad_norm": 2.25, "grad_norm_var": 0.023713175455729166, "learning_rate": 0.0001, "loss": 4.3878, "loss/crossentropy": 2.261234760284424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24549759924411774, "step": 8162 }, { "epoch": 0.16328, "grad_norm": 2.109375, "grad_norm_var": 0.012751261393229166, "learning_rate": 0.0001, "loss": 4.159, "loss/crossentropy": 1.9791623950004578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2018037736415863, "step": 8164 }, { "epoch": 0.16332, "grad_norm": 2.1875, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 4.6231, "loss/crossentropy": 2.3916029930114746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24446460604667664, "step": 8166 }, { "epoch": 0.16336, "grad_norm": 2.328125, "grad_norm_var": 0.007991536458333334, "learning_rate": 0.0001, "loss": 4.3251, "loss/crossentropy": 2.204437553882599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22334939241409302, "step": 8168 }, { "epoch": 0.1634, "grad_norm": 2.0625, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 4.3236, "loss/crossentropy": 2.2013272047042847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068365067243576, "step": 8170 }, { "epoch": 0.16344, "grad_norm": 2.03125, "grad_norm_var": 0.0065826416015625, "learning_rate": 0.0001, "loss": 4.2597, "loss/crossentropy": 1.8648701310157776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19690127670764923, "step": 8172 }, { "epoch": 0.16348, "grad_norm": 1.9453125, "grad_norm_var": 0.008857981363932291, "learning_rate": 0.0001, "loss": 4.1324, "loss/crossentropy": 2.221195936203003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900760903954506, "step": 8174 }, { "epoch": 0.16352, "grad_norm": 2.203125, "grad_norm_var": 0.009445953369140624, "learning_rate": 0.0001, "loss": 4.4874, "loss/crossentropy": 1.8648499846458435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982845515012741, "step": 8176 }, { "epoch": 0.16356, "grad_norm": 2.125, "grad_norm_var": 0.010762278238932292, "learning_rate": 0.0001, "loss": 4.588, "loss/crossentropy": 2.3032894134521484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25082532316446304, "step": 8178 }, { "epoch": 0.1636, "grad_norm": 2.203125, "grad_norm_var": 0.010931142171223958, "learning_rate": 0.0001, "loss": 4.3375, "loss/crossentropy": 2.0634626150131226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23546822369098663, "step": 8180 }, { "epoch": 0.16364, "grad_norm": 2.28125, "grad_norm_var": 0.012737782796223958, "learning_rate": 0.0001, "loss": 4.2402, "loss/crossentropy": 1.7406468391418457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20627902448177338, "step": 8182 }, { "epoch": 0.16368, "grad_norm": 2.125, "grad_norm_var": 0.012668609619140625, "learning_rate": 0.0001, "loss": 4.4103, "loss/crossentropy": 2.3812272548675537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25258868932724, "step": 8184 }, { "epoch": 0.16372, "grad_norm": 2.1875, "grad_norm_var": 0.012499745686848958, "learning_rate": 0.0001, "loss": 4.3584, "loss/crossentropy": 2.20754611492157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21015550196170807, "step": 8186 }, { "epoch": 0.16376, "grad_norm": 1.9921875, "grad_norm_var": 0.0140777587890625, "learning_rate": 0.0001, "loss": 4.1902, "loss/crossentropy": 2.081672966480255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19573034346103668, "step": 8188 }, { "epoch": 0.1638, "grad_norm": 2.140625, "grad_norm_var": 0.009905751546223958, "learning_rate": 0.0001, "loss": 4.5267, "loss/crossentropy": 2.184974491596222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2477174997329712, "step": 8190 }, { "epoch": 0.16384, "grad_norm": 2.125, "grad_norm_var": 0.010322825113932291, "learning_rate": 0.0001, "loss": 4.3804, "loss/crossentropy": 2.1048192977905273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22900478541851044, "step": 8192 }, { "epoch": 0.16388, "grad_norm": 2.046875, "grad_norm_var": 0.010135650634765625, "learning_rate": 0.0001, "loss": 4.1068, "loss/crossentropy": 1.960956335067749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23504969477653503, "step": 8194 }, { "epoch": 0.16392, "grad_norm": 2.484375, "grad_norm_var": 0.017114003499348957, "learning_rate": 0.0001, "loss": 4.7698, "loss/crossentropy": 2.158856213092804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200375646352768, "step": 8196 }, { "epoch": 0.16396, "grad_norm": 2.015625, "grad_norm_var": 0.017286936442057293, "learning_rate": 0.0001, "loss": 4.0533, "loss/crossentropy": 2.0954058170318604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22729048877954483, "step": 8198 }, { "epoch": 0.164, "grad_norm": 2.078125, "grad_norm_var": 0.01599299112955729, "learning_rate": 0.0001, "loss": 4.3492, "loss/crossentropy": 2.1452964544296265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21412881463766098, "step": 8200 }, { "epoch": 0.16404, "grad_norm": 1.8359375, "grad_norm_var": 0.02072321573893229, "learning_rate": 0.0001, "loss": 4.0204, "loss/crossentropy": 1.9737866520881653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19449186325073242, "step": 8202 }, { "epoch": 0.16408, "grad_norm": 2.109375, "grad_norm_var": 0.020344034830729166, "learning_rate": 0.0001, "loss": 4.5119, "loss/crossentropy": 2.213072657585144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333887815475464, "step": 8204 }, { "epoch": 0.16412, "grad_norm": 2.09375, "grad_norm_var": 0.0203277587890625, "learning_rate": 0.0001, "loss": 4.2139, "loss/crossentropy": 1.8702161312103271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21368274092674255, "step": 8206 }, { "epoch": 0.16416, "grad_norm": 1.9765625, "grad_norm_var": 0.02102635701497396, "learning_rate": 0.0001, "loss": 4.242, "loss/crossentropy": 2.177275776863098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21248809248209, "step": 8208 }, { "epoch": 0.1642, "grad_norm": 1.9375, "grad_norm_var": 0.021945953369140625, "learning_rate": 0.0001, "loss": 4.3151, "loss/crossentropy": 2.3422038555145264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288341298699379, "step": 8210 }, { "epoch": 0.16424, "grad_norm": 2.265625, "grad_norm_var": 0.014212799072265626, "learning_rate": 0.0001, "loss": 4.2195, "loss/crossentropy": 2.094432234764099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2224871665239334, "step": 8212 }, { "epoch": 0.16428, "grad_norm": 2.125, "grad_norm_var": 0.020072174072265626, "learning_rate": 0.0001, "loss": 4.5045, "loss/crossentropy": 2.3371682167053223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2710718363523483, "step": 8214 }, { "epoch": 0.16432, "grad_norm": 2.25, "grad_norm_var": 0.034242502848307294, "learning_rate": 0.0001, "loss": 4.6926, "loss/crossentropy": 1.8700988292694092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20795634388923645, "step": 8216 }, { "epoch": 0.16436, "grad_norm": 2.046875, "grad_norm_var": 0.02575658162434896, "learning_rate": 0.0001, "loss": 4.3266, "loss/crossentropy": 2.080985188484192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23497651517391205, "step": 8218 }, { "epoch": 0.1644, "grad_norm": 2.046875, "grad_norm_var": 0.026364898681640624, "learning_rate": 0.0001, "loss": 4.3873, "loss/crossentropy": 2.3486984968185425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25393396615982056, "step": 8220 }, { "epoch": 0.16444, "grad_norm": 2.15625, "grad_norm_var": 0.028148396809895834, "learning_rate": 0.0001, "loss": 4.3446, "loss/crossentropy": 2.0084245800971985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21738936007022858, "step": 8222 }, { "epoch": 0.16448, "grad_norm": 2.03125, "grad_norm_var": 0.02846247355143229, "learning_rate": 0.0001, "loss": 3.8584, "loss/crossentropy": 1.536482572555542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17787320166826248, "step": 8224 }, { "epoch": 0.16452, "grad_norm": 2.1875, "grad_norm_var": 0.024857330322265624, "learning_rate": 0.0001, "loss": 4.494, "loss/crossentropy": 2.1727080941200256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22967493534088135, "step": 8226 }, { "epoch": 0.16456, "grad_norm": 2.015625, "grad_norm_var": 0.02490208943684896, "learning_rate": 0.0001, "loss": 4.1895, "loss/crossentropy": 1.6915069222450256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19113170355558395, "step": 8228 }, { "epoch": 0.1646, "grad_norm": 2.046875, "grad_norm_var": 0.022739410400390625, "learning_rate": 0.0001, "loss": 4.4378, "loss/crossentropy": 2.1645957231521606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21431444585323334, "step": 8230 }, { "epoch": 0.16464, "grad_norm": 2.09375, "grad_norm_var": 0.00858154296875, "learning_rate": 0.0001, "loss": 3.8088, "loss/crossentropy": 1.8797736763954163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19344569742679596, "step": 8232 }, { "epoch": 0.16468, "grad_norm": 2.1875, "grad_norm_var": 0.0148345947265625, "learning_rate": 0.0001, "loss": 4.3119, "loss/crossentropy": 2.092893421649933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21476523578166962, "step": 8234 }, { "epoch": 0.16472, "grad_norm": 2.125, "grad_norm_var": 0.0143310546875, "learning_rate": 0.0001, "loss": 4.4145, "loss/crossentropy": 2.336071252822876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25474119186401367, "step": 8236 }, { "epoch": 0.16476, "grad_norm": 2.03125, "grad_norm_var": 0.014410146077473958, "learning_rate": 0.0001, "loss": 4.3054, "loss/crossentropy": 2.008872926235199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22672003507614136, "step": 8238 }, { "epoch": 0.1648, "grad_norm": 2.171875, "grad_norm_var": 0.011822255452473958, "learning_rate": 0.0001, "loss": 4.4997, "loss/crossentropy": 2.2229605317115784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2392737865447998, "step": 8240 }, { "epoch": 0.16484, "grad_norm": 2.171875, "grad_norm_var": 0.011525217692057292, "learning_rate": 0.0001, "loss": 4.5894, "loss/crossentropy": 2.1929808855056763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832375764846802, "step": 8242 }, { "epoch": 0.16488, "grad_norm": 2.171875, "grad_norm_var": 0.010625966389973958, "learning_rate": 0.0001, "loss": 4.642, "loss/crossentropy": 2.1557860374450684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24640469253063202, "step": 8244 }, { "epoch": 0.16492, "grad_norm": 2.109375, "grad_norm_var": 0.009582265218098959, "learning_rate": 0.0001, "loss": 4.2508, "loss/crossentropy": 2.2462236881256104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23639583587646484, "step": 8246 }, { "epoch": 0.16496, "grad_norm": 2.03125, "grad_norm_var": 0.009501139322916666, "learning_rate": 0.0001, "loss": 4.1476, "loss/crossentropy": 2.1238350868225098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23066531121730804, "step": 8248 }, { "epoch": 0.165, "grad_norm": 2.078125, "grad_norm_var": 0.0049479166666666664, "learning_rate": 0.0001, "loss": 4.1666, "loss/crossentropy": 1.6866248846054077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20677632093429565, "step": 8250 }, { "epoch": 0.16504, "grad_norm": 2.03125, "grad_norm_var": 0.005338541666666667, "learning_rate": 0.0001, "loss": 4.1494, "loss/crossentropy": 2.0640709400177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148914858698845, "step": 8252 }, { "epoch": 0.16508, "grad_norm": 2.234375, "grad_norm_var": 0.00513916015625, "learning_rate": 0.0001, "loss": 4.4138, "loss/crossentropy": 2.003119468688965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21817681193351746, "step": 8254 }, { "epoch": 0.16512, "grad_norm": 2.328125, "grad_norm_var": 0.007763671875, "learning_rate": 0.0001, "loss": 4.276, "loss/crossentropy": 2.053581953048706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22257455438375473, "step": 8256 }, { "epoch": 0.16516, "grad_norm": 2.046875, "grad_norm_var": 0.008381144205729166, "learning_rate": 0.0001, "loss": 4.1314, "loss/crossentropy": 1.788454830646515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21449437737464905, "step": 8258 }, { "epoch": 0.1652, "grad_norm": 2.265625, "grad_norm_var": 0.009032185872395833, "learning_rate": 0.0001, "loss": 4.3986, "loss/crossentropy": 1.8791787028312683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877290904521942, "step": 8260 }, { "epoch": 0.16524, "grad_norm": 2.140625, "grad_norm_var": 0.009761555989583334, "learning_rate": 0.0001, "loss": 4.4326, "loss/crossentropy": 2.2346811294555664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.221164733171463, "step": 8262 }, { "epoch": 0.16528, "grad_norm": 2.03125, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.3296, "loss/crossentropy": 2.1032413244247437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21829386800527573, "step": 8264 }, { "epoch": 0.16532, "grad_norm": 2.09375, "grad_norm_var": 0.008690388997395833, "learning_rate": 0.0001, "loss": 4.3188, "loss/crossentropy": 2.1452749967575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24733971804380417, "step": 8266 }, { "epoch": 0.16536, "grad_norm": 2.171875, "grad_norm_var": 0.008373006184895834, "learning_rate": 0.0001, "loss": 4.2093, "loss/crossentropy": 1.8318313956260681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22139593213796616, "step": 8268 }, { "epoch": 0.1654, "grad_norm": 2.09375, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 4.3494, "loss/crossentropy": 1.9865980744361877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22354473173618317, "step": 8270 }, { "epoch": 0.16544, "grad_norm": 2.21875, "grad_norm_var": 0.005464680989583333, "learning_rate": 0.0001, "loss": 4.318, "loss/crossentropy": 2.140509843826294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832269966602325, "step": 8272 }, { "epoch": 0.16548, "grad_norm": 2.15625, "grad_norm_var": 0.005052693684895833, "learning_rate": 0.0001, "loss": 4.1632, "loss/crossentropy": 2.3763319849967957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22866757214069366, "step": 8274 }, { "epoch": 0.16552, "grad_norm": 2.03125, "grad_norm_var": 0.004198201497395833, "learning_rate": 0.0001, "loss": 4.2366, "loss/crossentropy": 1.785762071609497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19961480796337128, "step": 8276 }, { "epoch": 0.16556, "grad_norm": 7.34375, "grad_norm_var": 1.7181477864583334, "learning_rate": 0.0001, "loss": 4.5317, "loss/crossentropy": 2.0500977635383606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2972872704267502, "step": 8278 }, { "epoch": 0.1656, "grad_norm": 2.265625, "grad_norm_var": 1.6987589518229167, "learning_rate": 0.0001, "loss": 4.4247, "loss/crossentropy": 2.3665153980255127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24115745723247528, "step": 8280 }, { "epoch": 0.16564, "grad_norm": 2.25, "grad_norm_var": 1.6973592122395833, "learning_rate": 0.0001, "loss": 4.326, "loss/crossentropy": 1.9636226892471313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048879787325859, "step": 8282 }, { "epoch": 0.16568, "grad_norm": 2.171875, "grad_norm_var": 1.6948720296223958, "learning_rate": 0.0001, "loss": 4.4303, "loss/crossentropy": 2.2624993324279785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22186043858528137, "step": 8284 }, { "epoch": 0.16572, "grad_norm": 2.046875, "grad_norm_var": 1.7048886617024739, "learning_rate": 0.0001, "loss": 4.0816, "loss/crossentropy": 2.098397970199585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219336099922657, "step": 8286 }, { "epoch": 0.16576, "grad_norm": 2.140625, "grad_norm_var": 1.6997393290201823, "learning_rate": 0.0001, "loss": 4.4705, "loss/crossentropy": 1.9795190691947937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145320549607277, "step": 8288 }, { "epoch": 0.1658, "grad_norm": 2.203125, "grad_norm_var": 1.6936927795410157, "learning_rate": 0.0001, "loss": 4.4625, "loss/crossentropy": 2.1028788089752197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22656689584255219, "step": 8290 }, { "epoch": 0.16584, "grad_norm": 1.984375, "grad_norm_var": 1.7015398661295573, "learning_rate": 0.0001, "loss": 4.1901, "loss/crossentropy": 2.2936136722564697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23688847571611404, "step": 8292 }, { "epoch": 0.16588, "grad_norm": 2.234375, "grad_norm_var": 0.01862360636393229, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 1.870754897594452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20914533734321594, "step": 8294 }, { "epoch": 0.16592, "grad_norm": 2.109375, "grad_norm_var": 0.007999674479166666, "learning_rate": 0.0001, "loss": 3.9884, "loss/crossentropy": 2.034530520439148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22234197705984116, "step": 8296 }, { "epoch": 0.16596, "grad_norm": 2.203125, "grad_norm_var": 0.0073811848958333336, "learning_rate": 0.0001, "loss": 4.3674, "loss/crossentropy": 1.974400520324707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20479100942611694, "step": 8298 }, { "epoch": 0.166, "grad_norm": 2.21875, "grad_norm_var": 0.0078765869140625, "learning_rate": 0.0001, "loss": 4.3157, "loss/crossentropy": 2.180828809738159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2229994386434555, "step": 8300 }, { "epoch": 0.16604, "grad_norm": 2.078125, "grad_norm_var": 0.007624308268229167, "learning_rate": 0.0001, "loss": 4.0704, "loss/crossentropy": 2.098487079143524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19709115475416183, "step": 8302 }, { "epoch": 0.16608, "grad_norm": 2.0625, "grad_norm_var": 0.007673136393229167, "learning_rate": 0.0001, "loss": 4.2292, "loss/crossentropy": 1.999358892440796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20891200006008148, "step": 8304 }, { "epoch": 0.16612, "grad_norm": 2.1875, "grad_norm_var": 0.009022776285807292, "learning_rate": 0.0001, "loss": 4.4263, "loss/crossentropy": 2.40866219997406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22355159372091293, "step": 8306 }, { "epoch": 0.16616, "grad_norm": 2.171875, "grad_norm_var": 0.008156077067057291, "learning_rate": 0.0001, "loss": 4.0532, "loss/crossentropy": 1.774325966835022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058887928724289, "step": 8308 }, { "epoch": 0.1662, "grad_norm": 2.125, "grad_norm_var": 0.007352447509765625, "learning_rate": 0.0001, "loss": 4.4951, "loss/crossentropy": 1.9870773553848267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21368569880723953, "step": 8310 }, { "epoch": 0.16624, "grad_norm": 2.40625, "grad_norm_var": 0.01121826171875, "learning_rate": 0.0001, "loss": 4.3496, "loss/crossentropy": 1.9224175810813904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2279355376958847, "step": 8312 }, { "epoch": 0.16628, "grad_norm": 2.171875, "grad_norm_var": 0.012230428059895833, "learning_rate": 0.0001, "loss": 4.4074, "loss/crossentropy": 2.011419177055359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2363004833459854, "step": 8314 }, { "epoch": 0.16632, "grad_norm": 2.328125, "grad_norm_var": 0.01444091796875, "learning_rate": 0.0001, "loss": 4.6627, "loss/crossentropy": 1.9777795672416687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21830055862665176, "step": 8316 }, { "epoch": 0.16636, "grad_norm": 2.046875, "grad_norm_var": 0.013508860270182292, "learning_rate": 0.0001, "loss": 4.3798, "loss/crossentropy": 2.3334981203079224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21089734882116318, "step": 8318 }, { "epoch": 0.1664, "grad_norm": 2.28125, "grad_norm_var": 0.015148671468098958, "learning_rate": 0.0001, "loss": 4.3792, "loss/crossentropy": 1.9137234687805176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21954041719436646, "step": 8320 }, { "epoch": 0.16644, "grad_norm": 2.15625, "grad_norm_var": 0.013492838541666666, "learning_rate": 0.0001, "loss": 4.3826, "loss/crossentropy": 2.1358155608177185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21321023255586624, "step": 8322 }, { "epoch": 0.16648, "grad_norm": 2.203125, "grad_norm_var": 0.013688151041666667, "learning_rate": 0.0001, "loss": 4.3454, "loss/crossentropy": 2.1747822165489197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23682481050491333, "step": 8324 }, { "epoch": 0.16652, "grad_norm": 2.078125, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 4.2685, "loss/crossentropy": 2.2818257808685303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24226247519254684, "step": 8326 }, { "epoch": 0.16656, "grad_norm": 2.1875, "grad_norm_var": 0.010155232747395833, "learning_rate": 0.0001, "loss": 4.4806, "loss/crossentropy": 1.9066791534423828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20610930025577545, "step": 8328 }, { "epoch": 0.1666, "grad_norm": 2.078125, "grad_norm_var": 0.008284505208333333, "learning_rate": 0.0001, "loss": 4.4799, "loss/crossentropy": 2.2431830763816833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22407780587673187, "step": 8330 }, { "epoch": 0.16664, "grad_norm": 2.125, "grad_norm_var": 0.0067708333333333336, "learning_rate": 0.0001, "loss": 4.5894, "loss/crossentropy": 2.569726347923279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2647472620010376, "step": 8332 }, { "epoch": 0.16668, "grad_norm": 2.140625, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 4.2307, "loss/crossentropy": 1.8364137411117554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113686427474022, "step": 8334 }, { "epoch": 0.16672, "grad_norm": 2.078125, "grad_norm_var": 0.007968902587890625, "learning_rate": 0.0001, "loss": 4.0369, "loss/crossentropy": 2.0510441064834595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268069088459015, "step": 8336 }, { "epoch": 0.16676, "grad_norm": 2.046875, "grad_norm_var": 0.007968902587890625, "learning_rate": 0.0001, "loss": 4.2528, "loss/crossentropy": 1.7306728959083557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20341812074184418, "step": 8338 }, { "epoch": 0.1668, "grad_norm": 2.578125, "grad_norm_var": 0.023361968994140624, "learning_rate": 0.0001, "loss": 4.8146, "loss/crossentropy": 2.398088574409485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2426001876592636, "step": 8340 }, { "epoch": 0.16684, "grad_norm": 2.109375, "grad_norm_var": 0.023128000895182292, "learning_rate": 0.0001, "loss": 4.0659, "loss/crossentropy": 2.1259487867355347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22666773200035095, "step": 8342 }, { "epoch": 0.16688, "grad_norm": 2.109375, "grad_norm_var": 0.022141265869140624, "learning_rate": 0.0001, "loss": 4.2857, "loss/crossentropy": 1.9744665026664734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2887374758720398, "step": 8344 }, { "epoch": 0.16692, "grad_norm": 2.09375, "grad_norm_var": 0.02316869099934896, "learning_rate": 0.0001, "loss": 4.1989, "loss/crossentropy": 1.9841225743293762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008478343486786, "step": 8346 }, { "epoch": 0.16696, "grad_norm": 2.125, "grad_norm_var": 0.1913469950358073, "learning_rate": 0.0001, "loss": 4.5208, "loss/crossentropy": 2.060486137866974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23954987525939941, "step": 8348 }, { "epoch": 0.167, "grad_norm": 2.03125, "grad_norm_var": 0.19230931599934895, "learning_rate": 0.0001, "loss": 4.2185, "loss/crossentropy": 1.9356245398521423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987425833940506, "step": 8350 }, { "epoch": 0.16704, "grad_norm": 2.203125, "grad_norm_var": 0.1835845947265625, "learning_rate": 0.0001, "loss": 4.1875, "loss/crossentropy": 1.9968677163124084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21396416425704956, "step": 8352 }, { "epoch": 0.16708, "grad_norm": 2.109375, "grad_norm_var": 0.1820465087890625, "learning_rate": 0.0001, "loss": 4.1696, "loss/crossentropy": 2.1678614616394043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22860293090343475, "step": 8354 }, { "epoch": 0.16712, "grad_norm": 2.078125, "grad_norm_var": 0.17534891764322916, "learning_rate": 0.0001, "loss": 4.1761, "loss/crossentropy": 1.7752264142036438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20555391907691956, "step": 8356 }, { "epoch": 0.16716, "grad_norm": 2.046875, "grad_norm_var": 0.1767578125, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 1.7073925137519836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19325412809848785, "step": 8358 }, { "epoch": 0.1672, "grad_norm": 2.4375, "grad_norm_var": 0.17827860514322916, "learning_rate": 0.0001, "loss": 4.7185, "loss/crossentropy": 2.2200660705566406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21834757924079895, "step": 8360 }, { "epoch": 0.16724, "grad_norm": 2.03125, "grad_norm_var": 0.17594401041666666, "learning_rate": 0.0001, "loss": 4.3551, "loss/crossentropy": 2.3598183393478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25592371821403503, "step": 8362 }, { "epoch": 0.16728, "grad_norm": 2.03125, "grad_norm_var": 0.014644368489583334, "learning_rate": 0.0001, "loss": 4.3429, "loss/crossentropy": 2.342926025390625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2480776235461235, "step": 8364 }, { "epoch": 0.16732, "grad_norm": 2.046875, "grad_norm_var": 0.014411417643229167, "learning_rate": 0.0001, "loss": 4.2802, "loss/crossentropy": 1.8505961894989014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21376194059848785, "step": 8366 }, { "epoch": 0.16736, "grad_norm": 2.078125, "grad_norm_var": 0.0146636962890625, "learning_rate": 0.0001, "loss": 4.4199, "loss/crossentropy": 2.2064108848571777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21334534883499146, "step": 8368 }, { "epoch": 0.1674, "grad_norm": 2.6875, "grad_norm_var": 0.0338043212890625, "learning_rate": 0.0001, "loss": 4.5344, "loss/crossentropy": 2.1280709505081177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24501197040081024, "step": 8370 }, { "epoch": 0.16744, "grad_norm": 2.203125, "grad_norm_var": 0.03312886555989583, "learning_rate": 0.0001, "loss": 4.3382, "loss/crossentropy": 2.2134695053100586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2368277609348297, "step": 8372 }, { "epoch": 0.16748, "grad_norm": 2.03125, "grad_norm_var": 0.03319905598958333, "learning_rate": 0.0001, "loss": 4.1197, "loss/crossentropy": 1.6942040920257568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21226602047681808, "step": 8374 }, { "epoch": 0.16752, "grad_norm": 2.09375, "grad_norm_var": 0.025755818684895834, "learning_rate": 0.0001, "loss": 4.5004, "loss/crossentropy": 2.1554355025291443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2655039578676224, "step": 8376 }, { "epoch": 0.16756, "grad_norm": 2.078125, "grad_norm_var": 0.024800618489583332, "learning_rate": 0.0001, "loss": 4.2446, "loss/crossentropy": 1.8107115030288696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756090670824051, "step": 8378 }, { "epoch": 0.1676, "grad_norm": 1.921875, "grad_norm_var": 0.026496378580729167, "learning_rate": 0.0001, "loss": 4.0745, "loss/crossentropy": 1.655519425868988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1760415881872177, "step": 8380 }, { "epoch": 0.16764, "grad_norm": 2.140625, "grad_norm_var": 0.026460774739583335, "learning_rate": 0.0001, "loss": 4.2369, "loss/crossentropy": 2.1618025302886963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21886380016803741, "step": 8382 }, { "epoch": 0.16768, "grad_norm": 2.109375, "grad_norm_var": 0.028270467122395834, "learning_rate": 0.0001, "loss": 4.0798, "loss/crossentropy": 2.043319880962372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21368451416492462, "step": 8384 }, { "epoch": 0.16772, "grad_norm": 2.1875, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.4035, "loss/crossentropy": 2.3043102025985718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25452760607004166, "step": 8386 }, { "epoch": 0.16776, "grad_norm": 2.21875, "grad_norm_var": 0.011546834309895834, "learning_rate": 0.0001, "loss": 4.5633, "loss/crossentropy": 2.3627192974090576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2438303530216217, "step": 8388 }, { "epoch": 0.1678, "grad_norm": 2.109375, "grad_norm_var": 0.011335245768229167, "learning_rate": 0.0001, "loss": 4.4339, "loss/crossentropy": 2.0960012674331665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20813053101301193, "step": 8390 }, { "epoch": 0.16784, "grad_norm": 1.921875, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.0401, "loss/crossentropy": 1.9132550358772278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19716795533895493, "step": 8392 }, { "epoch": 0.16788, "grad_norm": 2.078125, "grad_norm_var": 0.014742024739583333, "learning_rate": 0.0001, "loss": 4.2371, "loss/crossentropy": 2.1912059783935547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.233934685587883, "step": 8394 }, { "epoch": 0.16792, "grad_norm": 2.0625, "grad_norm_var": 0.012691243489583334, "learning_rate": 0.0001, "loss": 4.4501, "loss/crossentropy": 2.165616512298584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23475389927625656, "step": 8396 }, { "epoch": 0.16796, "grad_norm": 2.078125, "grad_norm_var": 0.01265869140625, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 2.209625542163849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24280209839344025, "step": 8398 }, { "epoch": 0.168, "grad_norm": 2.125, "grad_norm_var": 0.013606516520182292, "learning_rate": 0.0001, "loss": 3.8959, "loss/crossentropy": 1.972103476524353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072778418660164, "step": 8400 }, { "epoch": 0.16804, "grad_norm": 2.171875, "grad_norm_var": 0.009663645426432292, "learning_rate": 0.0001, "loss": 4.4646, "loss/crossentropy": 2.402593731880188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23028723895549774, "step": 8402 }, { "epoch": 0.16808, "grad_norm": 2.125, "grad_norm_var": 0.008754221598307292, "learning_rate": 0.0001, "loss": 4.392, "loss/crossentropy": 2.218156576156616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24104444682598114, "step": 8404 }, { "epoch": 0.16812, "grad_norm": 2.09375, "grad_norm_var": 0.008722941080729166, "learning_rate": 0.0001, "loss": 4.2713, "loss/crossentropy": 1.9802079796791077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134247124195099, "step": 8406 }, { "epoch": 0.16816, "grad_norm": 1.953125, "grad_norm_var": 0.008577219645182292, "learning_rate": 0.0001, "loss": 3.9765, "loss/crossentropy": 2.0322983264923096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21680974960327148, "step": 8408 }, { "epoch": 0.1682, "grad_norm": 2.03125, "grad_norm_var": 0.008194732666015624, "learning_rate": 0.0001, "loss": 4.2457, "loss/crossentropy": 2.11838436126709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319432571530342, "step": 8410 }, { "epoch": 0.16824, "grad_norm": 2.125, "grad_norm_var": 0.013398996988932292, "learning_rate": 0.0001, "loss": 4.2204, "loss/crossentropy": 2.3801279067993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23981253802776337, "step": 8412 }, { "epoch": 0.16828, "grad_norm": 2.078125, "grad_norm_var": 0.013042958577473958, "learning_rate": 0.0001, "loss": 4.2766, "loss/crossentropy": 2.0953307151794434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22835668921470642, "step": 8414 }, { "epoch": 0.16832, "grad_norm": 2.15625, "grad_norm_var": 0.011774698893229166, "learning_rate": 0.0001, "loss": 4.528, "loss/crossentropy": 2.0654338598251343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100541964173317, "step": 8416 }, { "epoch": 0.16836, "grad_norm": 2.0625, "grad_norm_var": 0.018184407552083334, "learning_rate": 0.0001, "loss": 4.5734, "loss/crossentropy": 2.383318305015564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246080219745636, "step": 8418 }, { "epoch": 0.1684, "grad_norm": 2.078125, "grad_norm_var": 0.018318684895833333, "learning_rate": 0.0001, "loss": 4.2764, "loss/crossentropy": 2.103544294834137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20659767091274261, "step": 8420 }, { "epoch": 0.16844, "grad_norm": 2.09375, "grad_norm_var": 0.017437489827473958, "learning_rate": 0.0001, "loss": 4.2737, "loss/crossentropy": 2.064394950866699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23458171635866165, "step": 8422 }, { "epoch": 0.16848, "grad_norm": 2.25, "grad_norm_var": 0.014872233072916666, "learning_rate": 0.0001, "loss": 4.4133, "loss/crossentropy": 2.087044835090637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23527196049690247, "step": 8424 }, { "epoch": 0.16852, "grad_norm": 2.15625, "grad_norm_var": 0.0133941650390625, "learning_rate": 0.0001, "loss": 4.5867, "loss/crossentropy": 2.41584312915802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2425323873758316, "step": 8426 }, { "epoch": 0.16856, "grad_norm": 2.09375, "grad_norm_var": 0.010400390625, "learning_rate": 0.0001, "loss": 4.3292, "loss/crossentropy": 2.2542352080345154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2330578714609146, "step": 8428 }, { "epoch": 0.1686, "grad_norm": 2.09375, "grad_norm_var": 0.010724894205729167, "learning_rate": 0.0001, "loss": 4.6628, "loss/crossentropy": 2.453263282775879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.280864879488945, "step": 8430 }, { "epoch": 0.16864, "grad_norm": 2.140625, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 4.0443, "loss/crossentropy": 1.9185429811477661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19230744242668152, "step": 8432 }, { "epoch": 0.16868, "grad_norm": 2.140625, "grad_norm_var": 0.005956013997395833, "learning_rate": 0.0001, "loss": 4.1683, "loss/crossentropy": 2.020436644554138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21756108105182648, "step": 8434 }, { "epoch": 0.16872, "grad_norm": 2.328125, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.7727, "loss/crossentropy": 2.3050636053085327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24950820207595825, "step": 8436 }, { "epoch": 0.16876, "grad_norm": 2.0625, "grad_norm_var": 0.01510009765625, "learning_rate": 0.0001, "loss": 4.4091, "loss/crossentropy": 2.2787232398986816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24267014116048813, "step": 8438 }, { "epoch": 0.1688, "grad_norm": 2.078125, "grad_norm_var": 0.015327962239583333, "learning_rate": 0.0001, "loss": 4.2927, "loss/crossentropy": 2.185176372528076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157401591539383, "step": 8440 }, { "epoch": 0.16884, "grad_norm": 2.265625, "grad_norm_var": 0.01558837890625, "learning_rate": 0.0001, "loss": 4.2087, "loss/crossentropy": 2.0673694610595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23145683109760284, "step": 8442 }, { "epoch": 0.16888, "grad_norm": 2.171875, "grad_norm_var": 0.01754150390625, "learning_rate": 0.0001, "loss": 4.0229, "loss/crossentropy": 1.9011740684509277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21629629284143448, "step": 8444 }, { "epoch": 0.16892, "grad_norm": 2.34375, "grad_norm_var": 0.018651326497395832, "learning_rate": 0.0001, "loss": 4.4821, "loss/crossentropy": 2.055977463722229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275683432817459, "step": 8446 }, { "epoch": 0.16896, "grad_norm": 2.140625, "grad_norm_var": 0.0171295166015625, "learning_rate": 0.0001, "loss": 4.2152, "loss/crossentropy": 1.752756416797638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19437911361455917, "step": 8448 }, { "epoch": 0.169, "grad_norm": 2.09375, "grad_norm_var": 0.015706380208333332, "learning_rate": 0.0001, "loss": 4.3631, "loss/crossentropy": 2.479012131690979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23593680560588837, "step": 8450 }, { "epoch": 0.16904, "grad_norm": 2.1875, "grad_norm_var": 0.015523274739583334, "learning_rate": 0.0001, "loss": 4.3245, "loss/crossentropy": 2.065472185611725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148493528366089, "step": 8452 }, { "epoch": 0.16908, "grad_norm": 2.078125, "grad_norm_var": 0.015901692708333335, "learning_rate": 0.0001, "loss": 4.3955, "loss/crossentropy": 2.214062213897705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25444991141557693, "step": 8454 }, { "epoch": 0.16912, "grad_norm": 2.296875, "grad_norm_var": 0.0155426025390625, "learning_rate": 0.0001, "loss": 4.2059, "loss/crossentropy": 1.7410383224487305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21134207397699356, "step": 8456 }, { "epoch": 0.16916, "grad_norm": 2.234375, "grad_norm_var": 0.020542144775390625, "learning_rate": 0.0001, "loss": 4.196, "loss/crossentropy": 1.9303107857704163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18110749125480652, "step": 8458 }, { "epoch": 0.1692, "grad_norm": 2.203125, "grad_norm_var": 0.01810480753580729, "learning_rate": 0.0001, "loss": 4.3916, "loss/crossentropy": 1.9091919660568237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21320972591638565, "step": 8460 }, { "epoch": 0.16924, "grad_norm": 2.0625, "grad_norm_var": 0.025099436442057293, "learning_rate": 0.0001, "loss": 4.4075, "loss/crossentropy": 1.8021087050437927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21294282376766205, "step": 8462 }, { "epoch": 0.16928, "grad_norm": 2.0, "grad_norm_var": 0.027186838785807292, "learning_rate": 0.0001, "loss": 3.97, "loss/crossentropy": 2.018254518508911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21781788766384125, "step": 8464 }, { "epoch": 0.16932, "grad_norm": 2.0625, "grad_norm_var": 0.02906061808268229, "learning_rate": 0.0001, "loss": 4.5665, "loss/crossentropy": 2.0593737959861755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24223209917545319, "step": 8466 }, { "epoch": 0.16936, "grad_norm": 2.125, "grad_norm_var": 0.029504140218098957, "learning_rate": 0.0001, "loss": 4.3116, "loss/crossentropy": 2.218974232673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22679369151592255, "step": 8468 }, { "epoch": 0.1694, "grad_norm": 2.078125, "grad_norm_var": 0.022332509358723957, "learning_rate": 0.0001, "loss": 4.3313, "loss/crossentropy": 2.182355046272278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23690057545900345, "step": 8470 }, { "epoch": 0.16944, "grad_norm": 2.375, "grad_norm_var": 0.024930572509765624, "learning_rate": 0.0001, "loss": 4.6241, "loss/crossentropy": 2.1669063568115234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2458687722682953, "step": 8472 }, { "epoch": 0.16948, "grad_norm": 2.171875, "grad_norm_var": 0.021100870768229165, "learning_rate": 0.0001, "loss": 4.3181, "loss/crossentropy": 2.0656558871269226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20421817898750305, "step": 8474 }, { "epoch": 0.16952, "grad_norm": 2.03125, "grad_norm_var": 0.024625651041666665, "learning_rate": 0.0001, "loss": 4.4916, "loss/crossentropy": 2.1470741033554077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20798998326063156, "step": 8476 }, { "epoch": 0.16956, "grad_norm": 2.1875, "grad_norm_var": 0.014972941080729166, "learning_rate": 0.0001, "loss": 4.2616, "loss/crossentropy": 2.206741452217102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327529340982437, "step": 8478 }, { "epoch": 0.1696, "grad_norm": 2.171875, "grad_norm_var": 0.012596638997395833, "learning_rate": 0.0001, "loss": 4.3338, "loss/crossentropy": 2.4421777725219727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2385794073343277, "step": 8480 }, { "epoch": 0.16964, "grad_norm": 2.015625, "grad_norm_var": 0.0115234375, "learning_rate": 0.0001, "loss": 3.963, "loss/crossentropy": 1.8545736074447632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18871797621250153, "step": 8482 }, { "epoch": 0.16968, "grad_norm": 2.078125, "grad_norm_var": 0.0118072509765625, "learning_rate": 0.0001, "loss": 4.1352, "loss/crossentropy": 1.807646930217743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19625309854745865, "step": 8484 }, { "epoch": 0.16972, "grad_norm": 2.1875, "grad_norm_var": 0.012202962239583334, "learning_rate": 0.0001, "loss": 4.5029, "loss/crossentropy": 1.923595905303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233208492398262, "step": 8486 }, { "epoch": 0.16976, "grad_norm": 2.0625, "grad_norm_var": 0.0080718994140625, "learning_rate": 0.0001, "loss": 4.3251, "loss/crossentropy": 2.1018277406692505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2556355446577072, "step": 8488 }, { "epoch": 0.1698, "grad_norm": 1.90625, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 4.1127, "loss/crossentropy": 1.9638542532920837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101784572005272, "step": 8490 }, { "epoch": 0.16984, "grad_norm": 2.109375, "grad_norm_var": 0.008703358968098958, "learning_rate": 0.0001, "loss": 3.9463, "loss/crossentropy": 1.7770507335662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004244327545166, "step": 8492 }, { "epoch": 0.16988, "grad_norm": 2.171875, "grad_norm_var": 0.008573150634765625, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 2.4064877033233643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23509501665830612, "step": 8494 }, { "epoch": 0.16992, "grad_norm": 2.0, "grad_norm_var": 0.008713531494140624, "learning_rate": 0.0001, "loss": 4.3197, "loss/crossentropy": 2.183193802833557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22985967248678207, "step": 8496 }, { "epoch": 0.16996, "grad_norm": 2.203125, "grad_norm_var": 0.009126536051432292, "learning_rate": 0.0001, "loss": 4.4983, "loss/crossentropy": 2.241714060306549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22633583843708038, "step": 8498 }, { "epoch": 0.17, "grad_norm": 1.9921875, "grad_norm_var": 0.0097808837890625, "learning_rate": 0.0001, "loss": 4.32, "loss/crossentropy": 2.212075114250183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20900271832942963, "step": 8500 }, { "epoch": 0.17004, "grad_norm": 2.09375, "grad_norm_var": 0.0071685791015625, "learning_rate": 0.0001, "loss": 4.1885, "loss/crossentropy": 2.1283940076828003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21419794112443924, "step": 8502 }, { "epoch": 0.17008, "grad_norm": 2.15625, "grad_norm_var": 0.00731201171875, "learning_rate": 0.0001, "loss": 4.3498, "loss/crossentropy": 2.1958925127983093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23375380039215088, "step": 8504 }, { "epoch": 0.17012, "grad_norm": 2.171875, "grad_norm_var": 0.0065673828125, "learning_rate": 0.0001, "loss": 4.3396, "loss/crossentropy": 1.7883376479148865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2097182646393776, "step": 8506 }, { "epoch": 0.17016, "grad_norm": 2.296875, "grad_norm_var": 0.006811269124348958, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.114001750946045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059268057346344, "step": 8508 }, { "epoch": 0.1702, "grad_norm": 2.15625, "grad_norm_var": 0.006929270426432292, "learning_rate": 0.0001, "loss": 4.3178, "loss/crossentropy": 2.2413129806518555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2517316862940788, "step": 8510 }, { "epoch": 0.17024, "grad_norm": 1.921875, "grad_norm_var": 0.015083567301432291, "learning_rate": 0.0001, "loss": 4.4202, "loss/crossentropy": 2.0968031883239746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2454070746898651, "step": 8512 }, { "epoch": 0.17028, "grad_norm": 1.875, "grad_norm_var": 0.01945978800455729, "learning_rate": 0.0001, "loss": 4.407, "loss/crossentropy": 1.9813454151153564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21238256990909576, "step": 8514 }, { "epoch": 0.17032, "grad_norm": 2.28125, "grad_norm_var": 0.019755045572916668, "learning_rate": 0.0001, "loss": 4.4724, "loss/crossentropy": 2.261451005935669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2393329069018364, "step": 8516 }, { "epoch": 0.17036, "grad_norm": 2.171875, "grad_norm_var": 0.020654296875, "learning_rate": 0.0001, "loss": 4.5377, "loss/crossentropy": 1.9661999344825745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088441476225853, "step": 8518 }, { "epoch": 0.1704, "grad_norm": 2.046875, "grad_norm_var": 0.022163899739583333, "learning_rate": 0.0001, "loss": 3.9288, "loss/crossentropy": 1.8275291323661804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20091407746076584, "step": 8520 }, { "epoch": 0.17044, "grad_norm": 2.046875, "grad_norm_var": 0.023346964518229166, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 2.266944646835327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2514675632119179, "step": 8522 }, { "epoch": 0.17048, "grad_norm": 2.28125, "grad_norm_var": 0.02271728515625, "learning_rate": 0.0001, "loss": 4.3571, "loss/crossentropy": 2.244120240211487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25534868240356445, "step": 8524 }, { "epoch": 0.17052, "grad_norm": 2.015625, "grad_norm_var": 0.025048828125, "learning_rate": 0.0001, "loss": 4.3037, "loss/crossentropy": 1.8628552556037903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21675898134708405, "step": 8526 }, { "epoch": 0.17056, "grad_norm": 2.109375, "grad_norm_var": 0.01597900390625, "learning_rate": 0.0001, "loss": 4.3883, "loss/crossentropy": 1.971808135509491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2143295481801033, "step": 8528 }, { "epoch": 0.1706, "grad_norm": 2.0625, "grad_norm_var": 0.0114410400390625, "learning_rate": 0.0001, "loss": 4.3481, "loss/crossentropy": 1.959191381931305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24014096707105637, "step": 8530 }, { "epoch": 0.17064, "grad_norm": 3.0, "grad_norm_var": 0.05729878743489583, "learning_rate": 0.0001, "loss": 4.2633, "loss/crossentropy": 1.836454451084137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20957449078559875, "step": 8532 }, { "epoch": 0.17068, "grad_norm": 2.140625, "grad_norm_var": 0.06005452473958333, "learning_rate": 0.0001, "loss": 4.1254, "loss/crossentropy": 2.003947675228119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23538918048143387, "step": 8534 }, { "epoch": 0.17072, "grad_norm": 2.375, "grad_norm_var": 0.05788472493489583, "learning_rate": 0.0001, "loss": 4.4412, "loss/crossentropy": 2.0154194831848145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278473973274231, "step": 8536 }, { "epoch": 0.17076, "grad_norm": 2.015625, "grad_norm_var": 0.05548502604166667, "learning_rate": 0.0001, "loss": 4.2426, "loss/crossentropy": 2.1560275554656982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21601636707782745, "step": 8538 }, { "epoch": 0.1708, "grad_norm": 2.078125, "grad_norm_var": 0.05689697265625, "learning_rate": 0.0001, "loss": 4.2258, "loss/crossentropy": 2.1494773626327515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22883395850658417, "step": 8540 }, { "epoch": 0.17084, "grad_norm": 2.234375, "grad_norm_var": 0.05607808430989583, "learning_rate": 0.0001, "loss": 4.5201, "loss/crossentropy": 2.20908784866333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26363062113523483, "step": 8542 }, { "epoch": 0.17088, "grad_norm": 2.0625, "grad_norm_var": 0.0566802978515625, "learning_rate": 0.0001, "loss": 4.2477, "loss/crossentropy": 1.7932087182998657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18538028001785278, "step": 8544 }, { "epoch": 0.17092, "grad_norm": 2.078125, "grad_norm_var": 0.05625, "learning_rate": 0.0001, "loss": 4.471, "loss/crossentropy": 2.069899260997772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22498925775289536, "step": 8546 }, { "epoch": 0.17096, "grad_norm": 2.46875, "grad_norm_var": 0.019169108072916666, "learning_rate": 0.0001, "loss": 4.474, "loss/crossentropy": 2.008604884147644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21895557641983032, "step": 8548 }, { "epoch": 0.171, "grad_norm": 2.171875, "grad_norm_var": 0.018798828125, "learning_rate": 0.0001, "loss": 4.1973, "loss/crossentropy": 2.0652626156806946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23341115564107895, "step": 8550 }, { "epoch": 0.17104, "grad_norm": 2.234375, "grad_norm_var": 0.015607706705729167, "learning_rate": 0.0001, "loss": 4.383, "loss/crossentropy": 1.9780349135398865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22891747951507568, "step": 8552 }, { "epoch": 0.17108, "grad_norm": 2.5, "grad_norm_var": 0.022980753580729166, "learning_rate": 0.0001, "loss": 4.4424, "loss/crossentropy": 2.163089871406555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2447328343987465, "step": 8554 }, { "epoch": 0.17112, "grad_norm": 2.21875, "grad_norm_var": 0.022945149739583334, "learning_rate": 0.0001, "loss": 4.1503, "loss/crossentropy": 2.2946064472198486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24353116750717163, "step": 8556 }, { "epoch": 0.17116, "grad_norm": 2.078125, "grad_norm_var": 0.022459920247395834, "learning_rate": 0.0001, "loss": 4.3049, "loss/crossentropy": 2.0238161087036133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2153262495994568, "step": 8558 }, { "epoch": 0.1712, "grad_norm": 2.09375, "grad_norm_var": 0.02604955037434896, "learning_rate": 0.0001, "loss": 3.8636, "loss/crossentropy": 1.5683120489120483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17940818518400192, "step": 8560 }, { "epoch": 0.17124, "grad_norm": 2.296875, "grad_norm_var": 0.027337392171223957, "learning_rate": 0.0001, "loss": 4.3867, "loss/crossentropy": 1.9956589937210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22829821705818176, "step": 8562 }, { "epoch": 0.17128, "grad_norm": 2.640625, "grad_norm_var": 0.036834462483723955, "learning_rate": 0.0001, "loss": 4.7055, "loss/crossentropy": 2.2242285013198853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23567892611026764, "step": 8564 }, { "epoch": 0.17132, "grad_norm": 2.046875, "grad_norm_var": 0.033719635009765624, "learning_rate": 0.0001, "loss": 4.274, "loss/crossentropy": 2.2143776416778564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2457498088479042, "step": 8566 }, { "epoch": 0.17136, "grad_norm": 2.15625, "grad_norm_var": 0.033782704671223955, "learning_rate": 0.0001, "loss": 4.354, "loss/crossentropy": 1.852292537689209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24004538357257843, "step": 8568 }, { "epoch": 0.1714, "grad_norm": 2.21875, "grad_norm_var": 0.02676976521809896, "learning_rate": 0.0001, "loss": 4.4429, "loss/crossentropy": 2.311089515686035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22059186547994614, "step": 8570 }, { "epoch": 0.17144, "grad_norm": 2.0625, "grad_norm_var": 0.027675120035807292, "learning_rate": 0.0001, "loss": 4.1697, "loss/crossentropy": 1.9324169754981995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21051711589097977, "step": 8572 }, { "epoch": 0.17148, "grad_norm": 2.171875, "grad_norm_var": 0.02904052734375, "learning_rate": 0.0001, "loss": 4.2457, "loss/crossentropy": 1.982999861240387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084646075963974, "step": 8574 }, { "epoch": 0.17152, "grad_norm": 2.078125, "grad_norm_var": 0.02505671183268229, "learning_rate": 0.0001, "loss": 4.2503, "loss/crossentropy": 2.1837204694747925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22953644394874573, "step": 8576 }, { "epoch": 0.17156, "grad_norm": 2.25, "grad_norm_var": 0.024102528889973957, "learning_rate": 0.0001, "loss": 4.4448, "loss/crossentropy": 2.2588841319084167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22851599752902985, "step": 8578 }, { "epoch": 0.1716, "grad_norm": 2.390625, "grad_norm_var": 0.011502838134765625, "learning_rate": 0.0001, "loss": 4.2033, "loss/crossentropy": 1.982733964920044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21284686028957367, "step": 8580 }, { "epoch": 0.17164, "grad_norm": 2.046875, "grad_norm_var": 0.012143707275390625, "learning_rate": 0.0001, "loss": 4.1238, "loss/crossentropy": 2.443873167037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23966002464294434, "step": 8582 }, { "epoch": 0.17168, "grad_norm": 5.09375, "grad_norm_var": 0.5580645243326823, "learning_rate": 0.0001, "loss": 4.2654, "loss/crossentropy": 2.462417483329773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2341725453734398, "step": 8584 }, { "epoch": 0.17172, "grad_norm": 2.453125, "grad_norm_var": 0.5521705627441407, "learning_rate": 0.0001, "loss": 4.183, "loss/crossentropy": 1.8569464683532715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224809430539608, "step": 8586 }, { "epoch": 0.17176, "grad_norm": 2.609375, "grad_norm_var": 0.55804443359375, "learning_rate": 0.0001, "loss": 4.2762, "loss/crossentropy": 2.017254650592804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2259984165430069, "step": 8588 }, { "epoch": 0.1718, "grad_norm": 2.1875, "grad_norm_var": 0.550066884358724, "learning_rate": 0.0001, "loss": 4.463, "loss/crossentropy": 1.9804525971412659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844170451164246, "step": 8590 }, { "epoch": 0.17184, "grad_norm": 2.015625, "grad_norm_var": 0.544781239827474, "learning_rate": 0.0001, "loss": 4.2676, "loss/crossentropy": 2.2506592869758606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22404606640338898, "step": 8592 }, { "epoch": 0.17188, "grad_norm": 2.0625, "grad_norm_var": 0.5579335530598958, "learning_rate": 0.0001, "loss": 4.197, "loss/crossentropy": 1.9515153765678406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216490276157856, "step": 8594 }, { "epoch": 0.17192, "grad_norm": 2.109375, "grad_norm_var": 0.5598052978515625, "learning_rate": 0.0001, "loss": 4.4037, "loss/crossentropy": 2.090883791446686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22496677190065384, "step": 8596 }, { "epoch": 0.17196, "grad_norm": 2.828125, "grad_norm_var": 0.5523844401041667, "learning_rate": 0.0001, "loss": 4.8429, "loss/crossentropy": 2.4344359636306763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2748369127511978, "step": 8598 }, { "epoch": 0.172, "grad_norm": 2.15625, "grad_norm_var": 0.05840250651041667, "learning_rate": 0.0001, "loss": 4.4359, "loss/crossentropy": 1.9280555844306946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20140594244003296, "step": 8600 }, { "epoch": 0.17204, "grad_norm": 2.15625, "grad_norm_var": 0.05537007649739583, "learning_rate": 0.0001, "loss": 4.2802, "loss/crossentropy": 2.045006573200226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21741003543138504, "step": 8602 }, { "epoch": 0.17208, "grad_norm": 2.0625, "grad_norm_var": 0.04461034138997396, "learning_rate": 0.0001, "loss": 4.1622, "loss/crossentropy": 2.092265546321869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20971956849098206, "step": 8604 }, { "epoch": 0.17212, "grad_norm": 1.9375, "grad_norm_var": 0.048130035400390625, "learning_rate": 0.0001, "loss": 4.16, "loss/crossentropy": 1.794768512248993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20680297911167145, "step": 8606 }, { "epoch": 0.17216, "grad_norm": 2.109375, "grad_norm_var": 0.05690078735351563, "learning_rate": 0.0001, "loss": 4.2213, "loss/crossentropy": 1.9316805601119995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163936346769333, "step": 8608 }, { "epoch": 0.1722, "grad_norm": 2.359375, "grad_norm_var": 0.05872294108072917, "learning_rate": 0.0001, "loss": 4.5981, "loss/crossentropy": 2.2786675691604614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2411205694079399, "step": 8610 }, { "epoch": 0.17224, "grad_norm": 2.234375, "grad_norm_var": 0.05852457682291667, "learning_rate": 0.0001, "loss": 4.5688, "loss/crossentropy": 1.9211469888687134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23195043951272964, "step": 8612 }, { "epoch": 0.17228, "grad_norm": 1.9921875, "grad_norm_var": 0.037393951416015626, "learning_rate": 0.0001, "loss": 4.1776, "loss/crossentropy": 1.900360643863678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2339140772819519, "step": 8614 }, { "epoch": 0.17232, "grad_norm": 2.125, "grad_norm_var": 0.03743464152018229, "learning_rate": 0.0001, "loss": 4.4408, "loss/crossentropy": 1.976994514465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.218951515853405, "step": 8616 }, { "epoch": 0.17236, "grad_norm": 2.09375, "grad_norm_var": 0.035982004801432294, "learning_rate": 0.0001, "loss": 4.1988, "loss/crossentropy": 2.045244038105011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20886047929525375, "step": 8618 }, { "epoch": 0.1724, "grad_norm": 2.25, "grad_norm_var": 0.03240534464518229, "learning_rate": 0.0001, "loss": 4.5082, "loss/crossentropy": 2.21256685256958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21809116005897522, "step": 8620 }, { "epoch": 0.17244, "grad_norm": 2.171875, "grad_norm_var": 0.02641779581705729, "learning_rate": 0.0001, "loss": 4.6127, "loss/crossentropy": 2.300337314605713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22060109674930573, "step": 8622 }, { "epoch": 0.17248, "grad_norm": 2.015625, "grad_norm_var": 0.016932932535807292, "learning_rate": 0.0001, "loss": 4.2411, "loss/crossentropy": 1.8734883666038513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20403072237968445, "step": 8624 }, { "epoch": 0.17252, "grad_norm": 2.5625, "grad_norm_var": 0.01962865193684896, "learning_rate": 0.0001, "loss": 4.6907, "loss/crossentropy": 2.1382813453674316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27397096157073975, "step": 8626 }, { "epoch": 0.17256, "grad_norm": 2.0625, "grad_norm_var": 0.019760894775390624, "learning_rate": 0.0001, "loss": 4.0917, "loss/crossentropy": 1.9718505144119263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20658842474222183, "step": 8628 }, { "epoch": 0.1726, "grad_norm": 2.125, "grad_norm_var": 0.016706339518229165, "learning_rate": 0.0001, "loss": 4.5233, "loss/crossentropy": 2.0957319736480713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187560573220253, "step": 8630 }, { "epoch": 0.17264, "grad_norm": 1.984375, "grad_norm_var": 0.019954427083333334, "learning_rate": 0.0001, "loss": 4.0986, "loss/crossentropy": 2.0504234433174133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22376062721014023, "step": 8632 }, { "epoch": 0.17268, "grad_norm": 2.21875, "grad_norm_var": 0.05233968098958333, "learning_rate": 0.0001, "loss": 4.4513, "loss/crossentropy": 2.057171046733856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23209689557552338, "step": 8634 }, { "epoch": 0.17272, "grad_norm": 2.109375, "grad_norm_var": 0.052611287434895834, "learning_rate": 0.0001, "loss": 4.3473, "loss/crossentropy": 1.9635317921638489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21854296326637268, "step": 8636 }, { "epoch": 0.17276, "grad_norm": 2.359375, "grad_norm_var": 0.0546539306640625, "learning_rate": 0.0001, "loss": 4.3124, "loss/crossentropy": 1.8973188400268555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24171262234449387, "step": 8638 }, { "epoch": 0.1728, "grad_norm": 2.109375, "grad_norm_var": 0.05347900390625, "learning_rate": 0.0001, "loss": 4.2068, "loss/crossentropy": 1.6730469465255737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17975886166095734, "step": 8640 }, { "epoch": 0.17284, "grad_norm": 2.265625, "grad_norm_var": 0.04702123006184896, "learning_rate": 0.0001, "loss": 4.4222, "loss/crossentropy": 2.2531689405441284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170827016234398, "step": 8642 }, { "epoch": 0.17288, "grad_norm": 2.171875, "grad_norm_var": 0.045873769124348956, "learning_rate": 0.0001, "loss": 4.0249, "loss/crossentropy": 2.0913639068603516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22586838155984879, "step": 8644 }, { "epoch": 0.17292, "grad_norm": 2.140625, "grad_norm_var": 0.04533869425455729, "learning_rate": 0.0001, "loss": 4.45, "loss/crossentropy": 2.163489580154419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2290281057357788, "step": 8646 }, { "epoch": 0.17296, "grad_norm": 2.15625, "grad_norm_var": 0.04267552693684896, "learning_rate": 0.0001, "loss": 4.3198, "loss/crossentropy": 2.0669034719467163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22411519289016724, "step": 8648 }, { "epoch": 0.173, "grad_norm": 2.046875, "grad_norm_var": 0.008213043212890625, "learning_rate": 0.0001, "loss": 4.0474, "loss/crossentropy": 1.9942336678504944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21262076497077942, "step": 8650 }, { "epoch": 0.17304, "grad_norm": 2.1875, "grad_norm_var": 0.009474436442057291, "learning_rate": 0.0001, "loss": 4.2701, "loss/crossentropy": 2.046514868736267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2376151606440544, "step": 8652 }, { "epoch": 0.17308, "grad_norm": 2.046875, "grad_norm_var": 0.005995432535807292, "learning_rate": 0.0001, "loss": 4.3308, "loss/crossentropy": 1.8385429382324219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19642101973295212, "step": 8654 }, { "epoch": 0.17312, "grad_norm": 2.125, "grad_norm_var": 0.007252756754557292, "learning_rate": 0.0001, "loss": 4.446, "loss/crossentropy": 2.259633481502533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24319174885749817, "step": 8656 }, { "epoch": 0.17316, "grad_norm": 2.15625, "grad_norm_var": 0.005301920572916666, "learning_rate": 0.0001, "loss": 4.2067, "loss/crossentropy": 1.9811018109321594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21801364421844482, "step": 8658 }, { "epoch": 0.1732, "grad_norm": 2.203125, "grad_norm_var": 0.0059722900390625, "learning_rate": 0.0001, "loss": 4.2158, "loss/crossentropy": 2.1726362705230713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23193368315696716, "step": 8660 }, { "epoch": 0.17324, "grad_norm": 2.1875, "grad_norm_var": 0.006086222330729167, "learning_rate": 0.0001, "loss": 4.2587, "loss/crossentropy": 1.9915854930877686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111910656094551, "step": 8662 }, { "epoch": 0.17328, "grad_norm": 2.671875, "grad_norm_var": 0.026851399739583334, "learning_rate": 0.0001, "loss": 4.5001, "loss/crossentropy": 1.9651137590408325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242890365421772, "step": 8664 }, { "epoch": 0.17332, "grad_norm": 2.15625, "grad_norm_var": 0.026008097330729167, "learning_rate": 0.0001, "loss": 4.3848, "loss/crossentropy": 1.865262508392334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21065659821033478, "step": 8666 }, { "epoch": 0.17336, "grad_norm": 2.0625, "grad_norm_var": 0.02535400390625, "learning_rate": 0.0001, "loss": 4.3402, "loss/crossentropy": 1.9073076248168945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959603875875473, "step": 8668 }, { "epoch": 0.1734, "grad_norm": 2.078125, "grad_norm_var": 0.024332682291666668, "learning_rate": 0.0001, "loss": 4.2965, "loss/crossentropy": 2.167983889579773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22176912426948547, "step": 8670 }, { "epoch": 0.17344, "grad_norm": 2.34375, "grad_norm_var": 0.5460896809895833, "learning_rate": 0.0001, "loss": 4.4551, "loss/crossentropy": 1.7029761672019958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.222617506980896, "step": 8672 }, { "epoch": 0.17348, "grad_norm": 2.046875, "grad_norm_var": 0.5439849853515625, "learning_rate": 0.0001, "loss": 4.3414, "loss/crossentropy": 2.053748309612274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23398682475090027, "step": 8674 }, { "epoch": 0.17352, "grad_norm": 2.703125, "grad_norm_var": 0.5408274332682291, "learning_rate": 0.0001, "loss": 4.7882, "loss/crossentropy": 2.309812903404236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2961876690387726, "step": 8676 }, { "epoch": 0.17356, "grad_norm": 1.9765625, "grad_norm_var": 0.5421953837076823, "learning_rate": 0.0001, "loss": 4.4416, "loss/crossentropy": 2.1045809984207153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22072184830904007, "step": 8678 }, { "epoch": 0.1736, "grad_norm": 2.015625, "grad_norm_var": 0.5490435282389323, "learning_rate": 0.0001, "loss": 4.4437, "loss/crossentropy": 2.2114070653915405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2208957076072693, "step": 8680 }, { "epoch": 0.17364, "grad_norm": 2.21875, "grad_norm_var": 0.5490435282389323, "learning_rate": 0.0001, "loss": 4.5931, "loss/crossentropy": 2.1773669719696045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22639526426792145, "step": 8682 }, { "epoch": 0.17368, "grad_norm": 2.234375, "grad_norm_var": 0.5398272196451823, "learning_rate": 0.0001, "loss": 3.9397, "loss/crossentropy": 1.4213417768478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17653951048851013, "step": 8684 }, { "epoch": 0.17372, "grad_norm": 1.9609375, "grad_norm_var": 0.5425374348958333, "learning_rate": 0.0001, "loss": 4.2711, "loss/crossentropy": 1.968630075454712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2007492408156395, "step": 8686 }, { "epoch": 0.17376, "grad_norm": 2.21875, "grad_norm_var": 0.030304972330729166, "learning_rate": 0.0001, "loss": 4.2297, "loss/crossentropy": 2.0826632976531982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21895240992307663, "step": 8688 }, { "epoch": 0.1738, "grad_norm": 2.21875, "grad_norm_var": 0.029002888997395834, "learning_rate": 0.0001, "loss": 4.4188, "loss/crossentropy": 2.2756701707839966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22170037031173706, "step": 8690 }, { "epoch": 0.17384, "grad_norm": 2.0625, "grad_norm_var": 0.010773722330729167, "learning_rate": 0.0001, "loss": 4.51, "loss/crossentropy": 2.329536557197571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24717354029417038, "step": 8692 }, { "epoch": 0.17388, "grad_norm": 1.9921875, "grad_norm_var": 0.010033162434895833, "learning_rate": 0.0001, "loss": 4.0776, "loss/crossentropy": 2.077241063117981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215146966278553, "step": 8694 }, { "epoch": 0.17392, "grad_norm": 2.109375, "grad_norm_var": 0.0091949462890625, "learning_rate": 0.0001, "loss": 4.3482, "loss/crossentropy": 2.2363221645355225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23428452014923096, "step": 8696 }, { "epoch": 0.17396, "grad_norm": 2.0, "grad_norm_var": 0.01024169921875, "learning_rate": 0.0001, "loss": 4.1321, "loss/crossentropy": 2.055815279483795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22674524784088135, "step": 8698 }, { "epoch": 0.174, "grad_norm": 2.015625, "grad_norm_var": 0.0098541259765625, "learning_rate": 0.0001, "loss": 4.2789, "loss/crossentropy": 2.205570936203003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23787499964237213, "step": 8700 }, { "epoch": 0.17404, "grad_norm": 2.0625, "grad_norm_var": 0.009895579020182291, "learning_rate": 0.0001, "loss": 4.4636, "loss/crossentropy": 2.262540578842163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23921719938516617, "step": 8702 }, { "epoch": 0.17408, "grad_norm": 1.9921875, "grad_norm_var": 0.010114542643229167, "learning_rate": 0.0001, "loss": 4.3372, "loss/crossentropy": 2.5464816093444824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23302219063043594, "step": 8704 }, { "epoch": 0.17412, "grad_norm": 2.09375, "grad_norm_var": 0.00897216796875, "learning_rate": 0.0001, "loss": 4.2355, "loss/crossentropy": 2.050383508205414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23393510282039642, "step": 8706 }, { "epoch": 0.17416, "grad_norm": 2.046875, "grad_norm_var": 0.0076812744140625, "learning_rate": 0.0001, "loss": 3.9943, "loss/crossentropy": 1.9034642577171326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20801686495542526, "step": 8708 }, { "epoch": 0.1742, "grad_norm": 2.09375, "grad_norm_var": 0.007045237223307291, "learning_rate": 0.0001, "loss": 4.2801, "loss/crossentropy": 2.313044309616089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24881915748119354, "step": 8710 }, { "epoch": 0.17424, "grad_norm": 2.015625, "grad_norm_var": 0.009388987223307292, "learning_rate": 0.0001, "loss": 4.3754, "loss/crossentropy": 1.973829746246338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21400053054094315, "step": 8712 }, { "epoch": 0.17428, "grad_norm": 2.25, "grad_norm_var": 0.010109202067057291, "learning_rate": 0.0001, "loss": 4.2936, "loss/crossentropy": 1.831783950328827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855721831321716, "step": 8714 }, { "epoch": 0.17432, "grad_norm": 2.15625, "grad_norm_var": 0.009683990478515625, "learning_rate": 0.0001, "loss": 4.2681, "loss/crossentropy": 2.0173734426498413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22365443408489227, "step": 8716 }, { "epoch": 0.17436, "grad_norm": 2.125, "grad_norm_var": 0.007155100504557292, "learning_rate": 0.0001, "loss": 4.4177, "loss/crossentropy": 1.6534234285354614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20528900623321533, "step": 8718 }, { "epoch": 0.1744, "grad_norm": 2.125, "grad_norm_var": 0.006396484375, "learning_rate": 0.0001, "loss": 4.3658, "loss/crossentropy": 1.8113531470298767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21433213353157043, "step": 8720 }, { "epoch": 0.17444, "grad_norm": 2.125, "grad_norm_var": 0.0064280192057291664, "learning_rate": 0.0001, "loss": 4.5135, "loss/crossentropy": 2.0750836730003357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22202756255865097, "step": 8722 }, { "epoch": 0.17448, "grad_norm": 2.046875, "grad_norm_var": 0.0056955973307291664, "learning_rate": 0.0001, "loss": 4.2512, "loss/crossentropy": 2.1388206481933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24137140065431595, "step": 8724 }, { "epoch": 0.17452, "grad_norm": 2.0625, "grad_norm_var": 0.006745402018229167, "learning_rate": 0.0001, "loss": 4.0401, "loss/crossentropy": 2.068696141242981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215234711766243, "step": 8726 }, { "epoch": 0.17456, "grad_norm": 1.984375, "grad_norm_var": 0.005280558268229167, "learning_rate": 0.0001, "loss": 4.0277, "loss/crossentropy": 1.6970900893211365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21108710020780563, "step": 8728 }, { "epoch": 0.1746, "grad_norm": 2.15625, "grad_norm_var": 0.005582682291666667, "learning_rate": 0.0001, "loss": 3.8555, "loss/crossentropy": 1.8847576975822449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151619866490364, "step": 8730 }, { "epoch": 0.17464, "grad_norm": 2.328125, "grad_norm_var": 0.009403483072916666, "learning_rate": 0.0001, "loss": 4.4088, "loss/crossentropy": 2.4103721380233765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2589537426829338, "step": 8732 }, { "epoch": 0.17468, "grad_norm": 2.203125, "grad_norm_var": 0.01011962890625, "learning_rate": 0.0001, "loss": 4.1415, "loss/crossentropy": 1.8340824842453003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22282177209854126, "step": 8734 }, { "epoch": 0.17472, "grad_norm": 2.078125, "grad_norm_var": 0.010302734375, "learning_rate": 0.0001, "loss": 4.2472, "loss/crossentropy": 1.88236665725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193661779165268, "step": 8736 }, { "epoch": 0.17476, "grad_norm": 2.28125, "grad_norm_var": 0.05056050618489583, "learning_rate": 0.0001, "loss": 4.441, "loss/crossentropy": 1.8121293783187866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21855003386735916, "step": 8738 }, { "epoch": 0.1748, "grad_norm": 2.203125, "grad_norm_var": 0.05090738932291667, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 2.215694308280945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2405528798699379, "step": 8740 }, { "epoch": 0.17484, "grad_norm": 2.0625, "grad_norm_var": 0.05090738932291667, "learning_rate": 0.0001, "loss": 4.2431, "loss/crossentropy": 2.124837279319763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22565175592899323, "step": 8742 }, { "epoch": 0.17488, "grad_norm": 2.203125, "grad_norm_var": 0.0476959228515625, "learning_rate": 0.0001, "loss": 4.6783, "loss/crossentropy": 2.2531429529190063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.259593665599823, "step": 8744 }, { "epoch": 0.17492, "grad_norm": 2.046875, "grad_norm_var": 0.04537760416666667, "learning_rate": 0.0001, "loss": 4.3546, "loss/crossentropy": 2.403178572654724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2475409209728241, "step": 8746 }, { "epoch": 0.17496, "grad_norm": 2.328125, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 4.5698, "loss/crossentropy": 1.7886858582496643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21677076816558838, "step": 8748 }, { "epoch": 0.175, "grad_norm": 2.109375, "grad_norm_var": 0.04397379557291667, "learning_rate": 0.0001, "loss": 4.1374, "loss/crossentropy": 1.9257569313049316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2098483294248581, "step": 8750 }, { "epoch": 0.17504, "grad_norm": 2.078125, "grad_norm_var": 0.04396870930989583, "learning_rate": 0.0001, "loss": 4.407, "loss/crossentropy": 2.1609140634536743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22846446931362152, "step": 8752 }, { "epoch": 0.17508, "grad_norm": 2.0625, "grad_norm_var": 0.0081939697265625, "learning_rate": 0.0001, "loss": 4.2683, "loss/crossentropy": 2.529700756072998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2515157088637352, "step": 8754 }, { "epoch": 0.17512, "grad_norm": 1.9921875, "grad_norm_var": 0.008548736572265625, "learning_rate": 0.0001, "loss": 4.4965, "loss/crossentropy": 2.1920565366744995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24070476740598679, "step": 8756 }, { "epoch": 0.17516, "grad_norm": 1.9453125, "grad_norm_var": 0.0097900390625, "learning_rate": 0.0001, "loss": 3.8416, "loss/crossentropy": 1.7714558839797974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787928193807602, "step": 8758 }, { "epoch": 0.1752, "grad_norm": 2.171875, "grad_norm_var": 0.009666951497395833, "learning_rate": 0.0001, "loss": 4.4457, "loss/crossentropy": 1.986818790435791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736730098724365, "step": 8760 }, { "epoch": 0.17524, "grad_norm": 2.015625, "grad_norm_var": 0.010453287760416667, "learning_rate": 0.0001, "loss": 4.2731, "loss/crossentropy": 1.8152282238006592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22459527850151062, "step": 8762 }, { "epoch": 0.17528, "grad_norm": 2.109375, "grad_norm_var": 0.0069163004557291664, "learning_rate": 0.0001, "loss": 4.4058, "loss/crossentropy": 2.2312777042388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23017627000808716, "step": 8764 }, { "epoch": 0.17532, "grad_norm": 2.375, "grad_norm_var": 0.013252766927083333, "learning_rate": 0.0001, "loss": 4.4714, "loss/crossentropy": 2.107849955558777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21224269270896912, "step": 8766 }, { "epoch": 0.17536, "grad_norm": 2.15625, "grad_norm_var": 0.013206990559895833, "learning_rate": 0.0001, "loss": 4.1641, "loss/crossentropy": 2.1588711738586426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200954109430313, "step": 8768 }, { "epoch": 0.1754, "grad_norm": 2.1875, "grad_norm_var": 0.0130126953125, "learning_rate": 0.0001, "loss": 4.4056, "loss/crossentropy": 2.1355313062667847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24048195779323578, "step": 8770 }, { "epoch": 0.17544, "grad_norm": 2.125, "grad_norm_var": 0.011557769775390626, "learning_rate": 0.0001, "loss": 4.3505, "loss/crossentropy": 2.477591037750244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24459562450647354, "step": 8772 }, { "epoch": 0.17548, "grad_norm": 2.046875, "grad_norm_var": 0.00845947265625, "learning_rate": 0.0001, "loss": 4.4184, "loss/crossentropy": 2.2577285766601562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23038798570632935, "step": 8774 }, { "epoch": 0.17552, "grad_norm": 2.15625, "grad_norm_var": 0.009813435872395833, "learning_rate": 0.0001, "loss": 4.2681, "loss/crossentropy": 2.239536762237549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22290733456611633, "step": 8776 }, { "epoch": 0.17556, "grad_norm": 2.125, "grad_norm_var": 0.007373046875, "learning_rate": 0.0001, "loss": 4.4351, "loss/crossentropy": 1.9139958024024963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2175864800810814, "step": 8778 }, { "epoch": 0.1756, "grad_norm": 2.0625, "grad_norm_var": 0.00888671875, "learning_rate": 0.0001, "loss": 4.0756, "loss/crossentropy": 2.0622661113739014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223067432641983, "step": 8780 }, { "epoch": 0.17564, "grad_norm": 2.15625, "grad_norm_var": 0.004524739583333334, "learning_rate": 0.0001, "loss": 4.4552, "loss/crossentropy": 2.222475051879883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181221306324005, "step": 8782 }, { "epoch": 0.17568, "grad_norm": 1.9296875, "grad_norm_var": 0.007002512613932292, "learning_rate": 0.0001, "loss": 4.1696, "loss/crossentropy": 2.2612074613571167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24079158157110214, "step": 8784 }, { "epoch": 0.17572, "grad_norm": 2.078125, "grad_norm_var": 0.005863189697265625, "learning_rate": 0.0001, "loss": 4.0889, "loss/crossentropy": 2.1629387736320496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21437199413776398, "step": 8786 }, { "epoch": 0.17576, "grad_norm": 2.328125, "grad_norm_var": 0.009492746988932292, "learning_rate": 0.0001, "loss": 4.6583, "loss/crossentropy": 2.145151972770691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23797442018985748, "step": 8788 }, { "epoch": 0.1758, "grad_norm": 2.453125, "grad_norm_var": 0.015457916259765624, "learning_rate": 0.0001, "loss": 4.5188, "loss/crossentropy": 2.0366984605789185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2191104218363762, "step": 8790 }, { "epoch": 0.17584, "grad_norm": 2.203125, "grad_norm_var": 0.014422353108723958, "learning_rate": 0.0001, "loss": 4.1604, "loss/crossentropy": 2.1049715280532837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230033777654171, "step": 8792 }, { "epoch": 0.17588, "grad_norm": 2.15625, "grad_norm_var": 0.014338938395182292, "learning_rate": 0.0001, "loss": 4.4327, "loss/crossentropy": 2.2549991607666016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23084092140197754, "step": 8794 }, { "epoch": 0.17592, "grad_norm": 2.125, "grad_norm_var": 0.015547688802083333, "learning_rate": 0.0001, "loss": 4.4653, "loss/crossentropy": 1.9873813390731812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879335910081863, "step": 8796 }, { "epoch": 0.17596, "grad_norm": 2.140625, "grad_norm_var": 0.015677897135416667, "learning_rate": 0.0001, "loss": 4.6371, "loss/crossentropy": 2.0723283886909485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20538055896759033, "step": 8798 }, { "epoch": 0.176, "grad_norm": 2.125, "grad_norm_var": 0.013099924723307291, "learning_rate": 0.0001, "loss": 4.0313, "loss/crossentropy": 2.090642750263214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2329270914196968, "step": 8800 }, { "epoch": 0.17604, "grad_norm": 2.1875, "grad_norm_var": 0.012672678629557291, "learning_rate": 0.0001, "loss": 4.4626, "loss/crossentropy": 2.3432271480560303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2456662431359291, "step": 8802 }, { "epoch": 0.17608, "grad_norm": 2.21875, "grad_norm_var": 0.010465240478515625, "learning_rate": 0.0001, "loss": 4.5133, "loss/crossentropy": 2.1210837364196777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22580894827842712, "step": 8804 }, { "epoch": 0.17612, "grad_norm": 2.0, "grad_norm_var": 0.005147043863932292, "learning_rate": 0.0001, "loss": 4.024, "loss/crossentropy": 2.142494797706604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22426588833332062, "step": 8806 }, { "epoch": 0.17616, "grad_norm": 2.0625, "grad_norm_var": 0.0049435933430989586, "learning_rate": 0.0001, "loss": 4.2877, "loss/crossentropy": 1.9163227677345276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21529845893383026, "step": 8808 }, { "epoch": 0.1762, "grad_norm": 2.140625, "grad_norm_var": 0.004937489827473958, "learning_rate": 0.0001, "loss": 4.4776, "loss/crossentropy": 2.1478612422943115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2354428842663765, "step": 8810 }, { "epoch": 0.17624, "grad_norm": 2.15625, "grad_norm_var": 0.0033854166666666668, "learning_rate": 0.0001, "loss": 4.435, "loss/crossentropy": 2.1546601057052612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22553270310163498, "step": 8812 }, { "epoch": 0.17628, "grad_norm": 2.140625, "grad_norm_var": 0.004233551025390625, "learning_rate": 0.0001, "loss": 4.1661, "loss/crossentropy": 2.0559862852096558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19714127480983734, "step": 8814 }, { "epoch": 0.17632, "grad_norm": 2.140625, "grad_norm_var": 0.004078928629557292, "learning_rate": 0.0001, "loss": 4.3543, "loss/crossentropy": 2.1340363025665283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22422882914543152, "step": 8816 }, { "epoch": 0.17636, "grad_norm": 2.078125, "grad_norm_var": 0.0038937886555989584, "learning_rate": 0.0001, "loss": 4.4464, "loss/crossentropy": 2.265942335128784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23249086737632751, "step": 8818 }, { "epoch": 0.1764, "grad_norm": 2.0625, "grad_norm_var": 0.0031939188639322916, "learning_rate": 0.0001, "loss": 4.3187, "loss/crossentropy": 2.245513081550598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22869569063186646, "step": 8820 }, { "epoch": 0.17644, "grad_norm": 2.0, "grad_norm_var": 0.006461334228515625, "learning_rate": 0.0001, "loss": 4.179, "loss/crossentropy": 1.851025104522705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21989689767360687, "step": 8822 }, { "epoch": 0.17648, "grad_norm": 2.09375, "grad_norm_var": 0.006266021728515625, "learning_rate": 0.0001, "loss": 4.276, "loss/crossentropy": 2.2972241640090942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22793132066726685, "step": 8824 }, { "epoch": 0.17652, "grad_norm": 2.078125, "grad_norm_var": 0.006276194254557292, "learning_rate": 0.0001, "loss": 4.3955, "loss/crossentropy": 2.248735189437866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2133597657084465, "step": 8826 }, { "epoch": 0.17656, "grad_norm": 2.15625, "grad_norm_var": 0.008314768473307291, "learning_rate": 0.0001, "loss": 4.4423, "loss/crossentropy": 2.4173099994659424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23985996842384338, "step": 8828 }, { "epoch": 0.1766, "grad_norm": 2.28125, "grad_norm_var": 0.008698527018229167, "learning_rate": 0.0001, "loss": 4.5425, "loss/crossentropy": 2.5017653703689575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26119648665189743, "step": 8830 }, { "epoch": 0.17664, "grad_norm": 2.09375, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 4.2474, "loss/crossentropy": 1.9006813764572144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19938994944095612, "step": 8832 }, { "epoch": 0.17668, "grad_norm": 2.046875, "grad_norm_var": 0.009496053059895834, "learning_rate": 0.0001, "loss": 4.1643, "loss/crossentropy": 2.101746916770935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20544035732746124, "step": 8834 }, { "epoch": 0.17672, "grad_norm": 2.15625, "grad_norm_var": 0.010542805989583333, "learning_rate": 0.0001, "loss": 4.3211, "loss/crossentropy": 2.1605160236358643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23061934113502502, "step": 8836 }, { "epoch": 0.17676, "grad_norm": 2.296875, "grad_norm_var": 0.008967081705729166, "learning_rate": 0.0001, "loss": 4.4357, "loss/crossentropy": 1.963772177696228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736530423164368, "step": 8838 }, { "epoch": 0.1768, "grad_norm": 2.1875, "grad_norm_var": 0.009733072916666667, "learning_rate": 0.0001, "loss": 4.3572, "loss/crossentropy": 2.154300093650818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22221273183822632, "step": 8840 }, { "epoch": 0.17684, "grad_norm": 2.109375, "grad_norm_var": 0.009501139322916666, "learning_rate": 0.0001, "loss": 4.4876, "loss/crossentropy": 2.1576497554779053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21834726631641388, "step": 8842 }, { "epoch": 0.17688, "grad_norm": 1.9375, "grad_norm_var": 0.0115875244140625, "learning_rate": 0.0001, "loss": 4.1192, "loss/crossentropy": 2.1316112279891968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166184037923813, "step": 8844 }, { "epoch": 0.17692, "grad_norm": 2.15625, "grad_norm_var": 0.0098052978515625, "learning_rate": 0.0001, "loss": 4.2421, "loss/crossentropy": 2.068525493144989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23183748871088028, "step": 8846 }, { "epoch": 0.17696, "grad_norm": 2.203125, "grad_norm_var": 0.01051025390625, "learning_rate": 0.0001, "loss": 4.5128, "loss/crossentropy": 2.185767650604248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25881427526474, "step": 8848 }, { "epoch": 0.177, "grad_norm": 2.125, "grad_norm_var": 0.010619099934895833, "learning_rate": 0.0001, "loss": 4.3073, "loss/crossentropy": 1.979454517364502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20669714361429214, "step": 8850 }, { "epoch": 0.17704, "grad_norm": 2.109375, "grad_norm_var": 0.009373982747395834, "learning_rate": 0.0001, "loss": 4.3356, "loss/crossentropy": 2.3473092317581177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23232270777225494, "step": 8852 }, { "epoch": 0.17708, "grad_norm": 2.046875, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.3732, "loss/crossentropy": 2.461324691772461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.242417573928833, "step": 8854 }, { "epoch": 0.17712, "grad_norm": 2.125, "grad_norm_var": 0.008385976155598959, "learning_rate": 0.0001, "loss": 4.1107, "loss/crossentropy": 1.5953214168548584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17768454551696777, "step": 8856 }, { "epoch": 0.17716, "grad_norm": 2.15625, "grad_norm_var": 0.008377838134765624, "learning_rate": 0.0001, "loss": 4.4289, "loss/crossentropy": 2.1969146728515625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21466109156608582, "step": 8858 }, { "epoch": 0.1772, "grad_norm": 1.921875, "grad_norm_var": 0.008459218343098958, "learning_rate": 0.0001, "loss": 4.2154, "loss/crossentropy": 2.243234634399414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22729554027318954, "step": 8860 }, { "epoch": 0.17724, "grad_norm": 2.296875, "grad_norm_var": 0.010628000895182291, "learning_rate": 0.0001, "loss": 4.2062, "loss/crossentropy": 2.1855397820472717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24905066192150116, "step": 8862 }, { "epoch": 0.17728, "grad_norm": 2.203125, "grad_norm_var": 0.010628000895182291, "learning_rate": 0.0001, "loss": 4.4075, "loss/crossentropy": 2.320886254310608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22135943174362183, "step": 8864 }, { "epoch": 0.17732, "grad_norm": 1.96875, "grad_norm_var": 0.011736806233723958, "learning_rate": 0.0001, "loss": 4.1444, "loss/crossentropy": 2.140891909599304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2109990492463112, "step": 8866 }, { "epoch": 0.17736, "grad_norm": 1.921875, "grad_norm_var": 0.013038889567057291, "learning_rate": 0.0001, "loss": 4.1009, "loss/crossentropy": 2.147824764251709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2131204530596733, "step": 8868 }, { "epoch": 0.1774, "grad_norm": 2.71875, "grad_norm_var": 0.039249420166015625, "learning_rate": 0.0001, "loss": 4.7136, "loss/crossentropy": 2.187807321548462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335718423128128, "step": 8870 }, { "epoch": 0.17744, "grad_norm": 2.1875, "grad_norm_var": 0.037287394205729164, "learning_rate": 0.0001, "loss": 4.5265, "loss/crossentropy": 2.3127458095550537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2385788857936859, "step": 8872 }, { "epoch": 0.17748, "grad_norm": 2.046875, "grad_norm_var": 0.0369537353515625, "learning_rate": 0.0001, "loss": 4.1364, "loss/crossentropy": 1.859586775302887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204575777053833, "step": 8874 }, { "epoch": 0.17752, "grad_norm": 2.0, "grad_norm_var": 0.03585611979166667, "learning_rate": 0.0001, "loss": 4.1993, "loss/crossentropy": 1.9626107215881348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21016598492860794, "step": 8876 }, { "epoch": 0.17756, "grad_norm": 2.046875, "grad_norm_var": 0.034077962239583336, "learning_rate": 0.0001, "loss": 4.462, "loss/crossentropy": 2.1130539774894714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21995095163583755, "step": 8878 }, { "epoch": 0.1776, "grad_norm": 2.203125, "grad_norm_var": 0.035374959309895836, "learning_rate": 0.0001, "loss": 4.4677, "loss/crossentropy": 1.8914743065834045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20437531173229218, "step": 8880 }, { "epoch": 0.17764, "grad_norm": 2.125, "grad_norm_var": 0.032763671875, "learning_rate": 0.0001, "loss": 4.4975, "loss/crossentropy": 2.2135708332061768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2478664517402649, "step": 8882 }, { "epoch": 0.17768, "grad_norm": 2.203125, "grad_norm_var": 0.028922526041666667, "learning_rate": 0.0001, "loss": 4.3763, "loss/crossentropy": 2.194110333919525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366471290588379, "step": 8884 }, { "epoch": 0.17772, "grad_norm": 1.9609375, "grad_norm_var": 0.009822336832682292, "learning_rate": 0.0001, "loss": 4.2844, "loss/crossentropy": 2.4365748167037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25842973589897156, "step": 8886 }, { "epoch": 0.17776, "grad_norm": 2.078125, "grad_norm_var": 0.010001373291015626, "learning_rate": 0.0001, "loss": 4.0574, "loss/crossentropy": 1.9800177216529846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21982619166374207, "step": 8888 }, { "epoch": 0.1778, "grad_norm": 2.15625, "grad_norm_var": 0.009956614176432291, "learning_rate": 0.0001, "loss": 4.3465, "loss/crossentropy": 2.1437748670578003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21494000405073166, "step": 8890 }, { "epoch": 0.17784, "grad_norm": 2.1875, "grad_norm_var": 0.008957672119140624, "learning_rate": 0.0001, "loss": 4.4051, "loss/crossentropy": 2.0610267519950867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22473593801259995, "step": 8892 }, { "epoch": 0.17788, "grad_norm": 1.953125, "grad_norm_var": 0.013952382405598958, "learning_rate": 0.0001, "loss": 4.4637, "loss/crossentropy": 2.261958599090576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2250354364514351, "step": 8894 }, { "epoch": 0.17792, "grad_norm": 2.109375, "grad_norm_var": 0.012237294514973959, "learning_rate": 0.0001, "loss": 4.3508, "loss/crossentropy": 2.3689773082733154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22185539454221725, "step": 8896 }, { "epoch": 0.17796, "grad_norm": 2.15625, "grad_norm_var": 0.013034820556640625, "learning_rate": 0.0001, "loss": 4.7432, "loss/crossentropy": 2.612341523170471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2561237961053848, "step": 8898 }, { "epoch": 0.178, "grad_norm": 2.0625, "grad_norm_var": 0.010990142822265625, "learning_rate": 0.0001, "loss": 4.5876, "loss/crossentropy": 2.1230576038360596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23115848749876022, "step": 8900 }, { "epoch": 0.17804, "grad_norm": 2.109375, "grad_norm_var": 0.009626261393229167, "learning_rate": 0.0001, "loss": 4.2154, "loss/crossentropy": 2.200004458427429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22002413868904114, "step": 8902 }, { "epoch": 0.17808, "grad_norm": 2.03125, "grad_norm_var": 0.009989420572916666, "learning_rate": 0.0001, "loss": 4.3438, "loss/crossentropy": 2.323713779449463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22923698276281357, "step": 8904 }, { "epoch": 0.17812, "grad_norm": 2.140625, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 4.3079, "loss/crossentropy": 2.038426458835602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21465667337179184, "step": 8906 }, { "epoch": 0.17816, "grad_norm": 2.109375, "grad_norm_var": 0.01217041015625, "learning_rate": 0.0001, "loss": 4.4498, "loss/crossentropy": 2.3639097213745117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2378860041499138, "step": 8908 }, { "epoch": 0.1782, "grad_norm": 2.046875, "grad_norm_var": 0.00699462890625, "learning_rate": 0.0001, "loss": 4.261, "loss/crossentropy": 1.8291080594062805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19894887506961823, "step": 8910 }, { "epoch": 0.17824, "grad_norm": 2.265625, "grad_norm_var": 0.011263020833333333, "learning_rate": 0.0001, "loss": 4.5606, "loss/crossentropy": 2.3113714456558228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24526391178369522, "step": 8912 }, { "epoch": 0.17828, "grad_norm": 2.109375, "grad_norm_var": 0.011449178059895834, "learning_rate": 0.0001, "loss": 4.3016, "loss/crossentropy": 2.114617943763733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21305133402347565, "step": 8914 }, { "epoch": 0.17832, "grad_norm": 2.15625, "grad_norm_var": 0.014975738525390626, "learning_rate": 0.0001, "loss": 3.9333, "loss/crossentropy": 1.6893808841705322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19195494800806046, "step": 8916 }, { "epoch": 0.17836, "grad_norm": 2.234375, "grad_norm_var": 0.016721343994140624, "learning_rate": 0.0001, "loss": 4.2501, "loss/crossentropy": 1.829396367073059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181780844926834, "step": 8918 }, { "epoch": 0.1784, "grad_norm": 2.03125, "grad_norm_var": 0.01869481404622396, "learning_rate": 0.0001, "loss": 4.1542, "loss/crossentropy": 1.8910154104232788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20975399017333984, "step": 8920 }, { "epoch": 0.17844, "grad_norm": 2.140625, "grad_norm_var": 0.01953709920247396, "learning_rate": 0.0001, "loss": 4.3327, "loss/crossentropy": 2.0501255989074707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22336408495903015, "step": 8922 }, { "epoch": 0.17848, "grad_norm": 2.109375, "grad_norm_var": 0.01740086873372396, "learning_rate": 0.0001, "loss": 4.3035, "loss/crossentropy": 2.3023892641067505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23251917958259583, "step": 8924 }, { "epoch": 0.17852, "grad_norm": 2.015625, "grad_norm_var": 0.01822077433268229, "learning_rate": 0.0001, "loss": 4.0592, "loss/crossentropy": 2.0030421018600464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19997069239616394, "step": 8926 }, { "epoch": 0.17856, "grad_norm": 1.9921875, "grad_norm_var": 0.013741048177083333, "learning_rate": 0.0001, "loss": 4.2568, "loss/crossentropy": 2.2309017181396484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22145257890224457, "step": 8928 }, { "epoch": 0.1786, "grad_norm": 2.171875, "grad_norm_var": 0.013728841145833334, "learning_rate": 0.0001, "loss": 4.4366, "loss/crossentropy": 2.1135157346725464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21580957621335983, "step": 8930 }, { "epoch": 0.17864, "grad_norm": 2.015625, "grad_norm_var": 0.011146799723307291, "learning_rate": 0.0001, "loss": 4.3473, "loss/crossentropy": 2.098900556564331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22469021379947662, "step": 8932 }, { "epoch": 0.17868, "grad_norm": 2.21875, "grad_norm_var": 0.0103179931640625, "learning_rate": 0.0001, "loss": 4.2702, "loss/crossentropy": 2.1558337211608887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24765773862600327, "step": 8934 }, { "epoch": 0.17872, "grad_norm": 2.09375, "grad_norm_var": 0.006004842122395834, "learning_rate": 0.0001, "loss": 4.3318, "loss/crossentropy": 2.141040623188019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490675747394562, "step": 8936 }, { "epoch": 0.17876, "grad_norm": 2.171875, "grad_norm_var": 0.0096099853515625, "learning_rate": 0.0001, "loss": 4.4126, "loss/crossentropy": 2.1036806106567383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23260055482387543, "step": 8938 }, { "epoch": 0.1788, "grad_norm": 1.984375, "grad_norm_var": 0.013036092122395834, "learning_rate": 0.0001, "loss": 4.4151, "loss/crossentropy": 1.9403663277626038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21941428631544113, "step": 8940 }, { "epoch": 0.17884, "grad_norm": 2.03125, "grad_norm_var": 0.014717610677083333, "learning_rate": 0.0001, "loss": 4.2, "loss/crossentropy": 1.8589079976081848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21533852070569992, "step": 8942 }, { "epoch": 0.17888, "grad_norm": 2.21875, "grad_norm_var": 0.013816070556640626, "learning_rate": 0.0001, "loss": 4.2476, "loss/crossentropy": 2.3136903643608093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23139026761054993, "step": 8944 }, { "epoch": 0.17892, "grad_norm": 2.109375, "grad_norm_var": 0.012835439046223958, "learning_rate": 0.0001, "loss": 4.2669, "loss/crossentropy": 2.305663585662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24263737350702286, "step": 8946 }, { "epoch": 0.17896, "grad_norm": 2.5625, "grad_norm_var": 0.02195002237955729, "learning_rate": 0.0001, "loss": 4.3059, "loss/crossentropy": 2.0450612902641296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124529778957367, "step": 8948 }, { "epoch": 0.179, "grad_norm": 2.3125, "grad_norm_var": 0.020393880208333333, "learning_rate": 0.0001, "loss": 4.4229, "loss/crossentropy": 2.435065984725952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24603386223316193, "step": 8950 }, { "epoch": 0.17904, "grad_norm": 2.234375, "grad_norm_var": 0.02295099894205729, "learning_rate": 0.0001, "loss": 4.2604, "loss/crossentropy": 2.092818021774292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20905248820781708, "step": 8952 }, { "epoch": 0.17908, "grad_norm": 2.109375, "grad_norm_var": 0.024621327718098957, "learning_rate": 0.0001, "loss": 4.2539, "loss/crossentropy": 2.08588969707489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142588049173355, "step": 8954 }, { "epoch": 0.17912, "grad_norm": 2.25, "grad_norm_var": 0.021144358317057292, "learning_rate": 0.0001, "loss": 4.3958, "loss/crossentropy": 1.9161878824234009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20626582205295563, "step": 8956 }, { "epoch": 0.17916, "grad_norm": 2.078125, "grad_norm_var": 0.020182037353515626, "learning_rate": 0.0001, "loss": 4.3726, "loss/crossentropy": 2.3072937726974487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22028075903654099, "step": 8958 }, { "epoch": 0.1792, "grad_norm": 2.15625, "grad_norm_var": 0.021109771728515626, "learning_rate": 0.0001, "loss": 4.4876, "loss/crossentropy": 2.3053938150405884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22212930023670197, "step": 8960 }, { "epoch": 0.17924, "grad_norm": 2.0625, "grad_norm_var": 0.02269261678059896, "learning_rate": 0.0001, "loss": 4.5137, "loss/crossentropy": 1.9130414128303528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20036083459854126, "step": 8962 }, { "epoch": 0.17928, "grad_norm": 2.0, "grad_norm_var": 0.013242340087890625, "learning_rate": 0.0001, "loss": 4.1299, "loss/crossentropy": 2.3808066844940186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116570845246315, "step": 8964 }, { "epoch": 0.17932, "grad_norm": 2.125, "grad_norm_var": 0.009549713134765625, "learning_rate": 0.0001, "loss": 4.296, "loss/crossentropy": 2.180716395378113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275484874844551, "step": 8966 }, { "epoch": 0.17936, "grad_norm": 1.984375, "grad_norm_var": 0.008837890625, "learning_rate": 0.0001, "loss": 4.2597, "loss/crossentropy": 2.027850866317749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2011229619383812, "step": 8968 }, { "epoch": 0.1794, "grad_norm": 2.171875, "grad_norm_var": 0.017447916666666667, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.077622890472412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21361806988716125, "step": 8970 }, { "epoch": 0.17944, "grad_norm": 2.03125, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 4.1972, "loss/crossentropy": 2.004905104637146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21110886335372925, "step": 8972 }, { "epoch": 0.17948, "grad_norm": 2.21875, "grad_norm_var": 0.019466145833333334, "learning_rate": 0.0001, "loss": 4.5026, "loss/crossentropy": 2.1859925389289856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23011507838964462, "step": 8974 }, { "epoch": 0.17952, "grad_norm": 2.21875, "grad_norm_var": 0.019840494791666666, "learning_rate": 0.0001, "loss": 4.2908, "loss/crossentropy": 1.8372295498847961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171105071902275, "step": 8976 }, { "epoch": 0.17956, "grad_norm": 2.328125, "grad_norm_var": 0.020197550455729168, "learning_rate": 0.0001, "loss": 4.3887, "loss/crossentropy": 2.127313494682312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22017831355333328, "step": 8978 }, { "epoch": 0.1796, "grad_norm": 2.421875, "grad_norm_var": 0.022098795572916666, "learning_rate": 0.0001, "loss": 4.3389, "loss/crossentropy": 2.121580421924591, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24898843467235565, "step": 8980 }, { "epoch": 0.17964, "grad_norm": 2.109375, "grad_norm_var": 0.0226470947265625, "learning_rate": 0.0001, "loss": 4.1127, "loss/crossentropy": 2.0973563194274902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21355029940605164, "step": 8982 }, { "epoch": 0.17968, "grad_norm": 2.125, "grad_norm_var": 0.018896484375, "learning_rate": 0.0001, "loss": 4.3898, "loss/crossentropy": 2.1109927892684937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096879929304123, "step": 8984 }, { "epoch": 0.17972, "grad_norm": 2.078125, "grad_norm_var": 0.0126617431640625, "learning_rate": 0.0001, "loss": 4.1264, "loss/crossentropy": 1.954129159450531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20745816081762314, "step": 8986 }, { "epoch": 0.17976, "grad_norm": 2.390625, "grad_norm_var": 0.013483683268229166, "learning_rate": 0.0001, "loss": 4.5357, "loss/crossentropy": 1.875806748867035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19150983542203903, "step": 8988 }, { "epoch": 0.1798, "grad_norm": 2.109375, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 4.3042, "loss/crossentropy": 2.3101617097854614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2393202781677246, "step": 8990 }, { "epoch": 0.17984, "grad_norm": 2.234375, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 4.4954, "loss/crossentropy": 2.3156551122665405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23346291482448578, "step": 8992 }, { "epoch": 0.17988, "grad_norm": 2.1875, "grad_norm_var": 0.013966623942057292, "learning_rate": 0.0001, "loss": 4.3675, "loss/crossentropy": 2.4437999725341797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172817811369896, "step": 8994 }, { "epoch": 0.17992, "grad_norm": 2.046875, "grad_norm_var": 0.009437815348307291, "learning_rate": 0.0001, "loss": 4.3122, "loss/crossentropy": 2.0451250076293945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22556670010089874, "step": 8996 }, { "epoch": 0.17996, "grad_norm": 2.21875, "grad_norm_var": 0.009852854410807292, "learning_rate": 0.0001, "loss": 4.5654, "loss/crossentropy": 2.3135393857955933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2313593551516533, "step": 8998 }, { "epoch": 0.18, "grad_norm": 2.125, "grad_norm_var": 0.010530344645182292, "learning_rate": 0.0001, "loss": 4.3001, "loss/crossentropy": 1.9934805035591125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20815817266702652, "step": 9000 }, { "epoch": 0.18004, "grad_norm": 2.125, "grad_norm_var": 0.011982981363932292, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.1770662665367126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23489046096801758, "step": 9002 }, { "epoch": 0.18008, "grad_norm": 2.3125, "grad_norm_var": 0.009779612223307291, "learning_rate": 0.0001, "loss": 4.3708, "loss/crossentropy": 1.9791364073753357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139936238527298, "step": 9004 }, { "epoch": 0.18012, "grad_norm": 2.015625, "grad_norm_var": 0.011445871988932292, "learning_rate": 0.0001, "loss": 4.298, "loss/crossentropy": 2.2092931270599365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22620443254709244, "step": 9006 }, { "epoch": 0.18016, "grad_norm": 2.171875, "grad_norm_var": 0.011034901936848958, "learning_rate": 0.0001, "loss": 4.3114, "loss/crossentropy": 2.123443365097046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21943332999944687, "step": 9008 }, { "epoch": 0.1802, "grad_norm": 2.09375, "grad_norm_var": 0.009212239583333334, "learning_rate": 0.0001, "loss": 4.4666, "loss/crossentropy": 2.243329405784607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24108020961284637, "step": 9010 }, { "epoch": 0.18024, "grad_norm": 2.1875, "grad_norm_var": 0.008426920572916666, "learning_rate": 0.0001, "loss": 4.3687, "loss/crossentropy": 2.366227388381958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21806316077709198, "step": 9012 }, { "epoch": 0.18028, "grad_norm": 2.03125, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 3.8255, "loss/crossentropy": 1.768812358379364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861925944685936, "step": 9014 }, { "epoch": 0.18032, "grad_norm": 1.9296875, "grad_norm_var": 0.012225087483723958, "learning_rate": 0.0001, "loss": 4.1236, "loss/crossentropy": 1.9376537799835205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18691913783550262, "step": 9016 }, { "epoch": 0.18036, "grad_norm": 2.140625, "grad_norm_var": 0.010709381103515625, "learning_rate": 0.0001, "loss": 4.2748, "loss/crossentropy": 2.3026299476623535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23757526278495789, "step": 9018 }, { "epoch": 0.1804, "grad_norm": 1.9453125, "grad_norm_var": 0.010057576497395833, "learning_rate": 0.0001, "loss": 4.0026, "loss/crossentropy": 1.9697216153144836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21453910320997238, "step": 9020 }, { "epoch": 0.18044, "grad_norm": 2.0625, "grad_norm_var": 0.011058553059895834, "learning_rate": 0.0001, "loss": 4.5457, "loss/crossentropy": 2.257638931274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24236120283603668, "step": 9022 }, { "epoch": 0.18048, "grad_norm": 2.0625, "grad_norm_var": 0.0115142822265625, "learning_rate": 0.0001, "loss": 4.3323, "loss/crossentropy": 2.244320869445801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25429578125476837, "step": 9024 }, { "epoch": 0.18052, "grad_norm": 2.21875, "grad_norm_var": 0.012214152018229167, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.1105872988700867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23081901669502258, "step": 9026 }, { "epoch": 0.18056, "grad_norm": 1.9921875, "grad_norm_var": 0.012839508056640626, "learning_rate": 0.0001, "loss": 4.2561, "loss/crossentropy": 1.9657647609710693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22111424803733826, "step": 9028 }, { "epoch": 0.1806, "grad_norm": 2.125, "grad_norm_var": 0.012308502197265625, "learning_rate": 0.0001, "loss": 4.4019, "loss/crossentropy": 2.0759438276290894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22763221710920334, "step": 9030 }, { "epoch": 0.18064, "grad_norm": 2.125, "grad_norm_var": 0.010888417561848959, "learning_rate": 0.0001, "loss": 3.9999, "loss/crossentropy": 2.0250572562217712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2279352843761444, "step": 9032 }, { "epoch": 0.18068, "grad_norm": 2.09375, "grad_norm_var": 0.010534413655598958, "learning_rate": 0.0001, "loss": 4.3772, "loss/crossentropy": 2.270031213760376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23242096602916718, "step": 9034 }, { "epoch": 0.18072, "grad_norm": 2.25, "grad_norm_var": 0.011922200520833334, "learning_rate": 0.0001, "loss": 4.4304, "loss/crossentropy": 2.10041344165802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23192601650953293, "step": 9036 }, { "epoch": 0.18076, "grad_norm": 2.15625, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.1041, "loss/crossentropy": 2.0255953073501587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23707614094018936, "step": 9038 }, { "epoch": 0.1808, "grad_norm": 1.9453125, "grad_norm_var": 0.010406239827473959, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.8162729740142822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19466694444417953, "step": 9040 }, { "epoch": 0.18084, "grad_norm": 2.203125, "grad_norm_var": 0.010170237223307291, "learning_rate": 0.0001, "loss": 4.5543, "loss/crossentropy": 2.1271599531173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22750889509916306, "step": 9042 }, { "epoch": 0.18088, "grad_norm": 2.171875, "grad_norm_var": 0.009447224934895833, "learning_rate": 0.0001, "loss": 4.5377, "loss/crossentropy": 2.3638603687286377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2456393539905548, "step": 9044 }, { "epoch": 0.18092, "grad_norm": 2.125, "grad_norm_var": 0.019437662760416665, "learning_rate": 0.0001, "loss": 4.3413, "loss/crossentropy": 1.5851669907569885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20994101464748383, "step": 9046 }, { "epoch": 0.18096, "grad_norm": 2.1875, "grad_norm_var": 0.017789459228515624, "learning_rate": 0.0001, "loss": 4.2284, "loss/crossentropy": 1.7990906834602356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18919725716114044, "step": 9048 }, { "epoch": 0.181, "grad_norm": 2.28125, "grad_norm_var": 0.017490386962890625, "learning_rate": 0.0001, "loss": 4.5415, "loss/crossentropy": 2.0975595712661743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23837832361459732, "step": 9050 }, { "epoch": 0.18104, "grad_norm": 2.0625, "grad_norm_var": 0.018070475260416666, "learning_rate": 0.0001, "loss": 4.1304, "loss/crossentropy": 2.1970856189727783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23219536244869232, "step": 9052 }, { "epoch": 0.18108, "grad_norm": 2.15625, "grad_norm_var": 0.017292277018229166, "learning_rate": 0.0001, "loss": 4.211, "loss/crossentropy": 2.0846009850502014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20775136351585388, "step": 9054 }, { "epoch": 0.18112, "grad_norm": 2.0625, "grad_norm_var": 0.015075429280598959, "learning_rate": 0.0001, "loss": 4.0415, "loss/crossentropy": 1.662496030330658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18223516643047333, "step": 9056 }, { "epoch": 0.18116, "grad_norm": 2.046875, "grad_norm_var": 0.016078440348307292, "learning_rate": 0.0001, "loss": 4.2561, "loss/crossentropy": 2.126902401447296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193778082728386, "step": 9058 }, { "epoch": 0.1812, "grad_norm": 2.984375, "grad_norm_var": 0.062459309895833336, "learning_rate": 0.0001, "loss": 4.0343, "loss/crossentropy": 1.9529814720153809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19026879966259003, "step": 9060 }, { "epoch": 0.18124, "grad_norm": 1.953125, "grad_norm_var": 0.05981852213541667, "learning_rate": 0.0001, "loss": 4.1911, "loss/crossentropy": 2.140450179576874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23864319920539856, "step": 9062 }, { "epoch": 0.18128, "grad_norm": 2.015625, "grad_norm_var": 0.06083882649739583, "learning_rate": 0.0001, "loss": 4.3442, "loss/crossentropy": 2.4139195680618286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2360195592045784, "step": 9064 }, { "epoch": 0.18132, "grad_norm": 2.046875, "grad_norm_var": 0.06083882649739583, "learning_rate": 0.0001, "loss": 4.5122, "loss/crossentropy": 2.190356135368347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22671552002429962, "step": 9066 }, { "epoch": 0.18136, "grad_norm": 2.078125, "grad_norm_var": 0.05916315714518229, "learning_rate": 0.0001, "loss": 4.281, "loss/crossentropy": 1.9428812861442566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22318705916404724, "step": 9068 }, { "epoch": 0.1814, "grad_norm": 2.046875, "grad_norm_var": 0.059242502848307295, "learning_rate": 0.0001, "loss": 4.4628, "loss/crossentropy": 2.296473503112793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22005227208137512, "step": 9070 }, { "epoch": 0.18144, "grad_norm": 2.1875, "grad_norm_var": 0.05919774373372396, "learning_rate": 0.0001, "loss": 4.5848, "loss/crossentropy": 2.2118901014328003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22189343720674515, "step": 9072 }, { "epoch": 0.18148, "grad_norm": 2.15625, "grad_norm_var": 0.05810114542643229, "learning_rate": 0.0001, "loss": 4.285, "loss/crossentropy": 1.8919037580490112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986190229654312, "step": 9074 }, { "epoch": 0.18152, "grad_norm": 2.171875, "grad_norm_var": 0.011161295572916667, "learning_rate": 0.0001, "loss": 4.3046, "loss/crossentropy": 1.975312054157257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20403321832418442, "step": 9076 }, { "epoch": 0.18156, "grad_norm": 2.265625, "grad_norm_var": 0.0075032552083333336, "learning_rate": 0.0001, "loss": 4.2175, "loss/crossentropy": 1.8076966404914856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19708115607500076, "step": 9078 }, { "epoch": 0.1816, "grad_norm": 2.265625, "grad_norm_var": 0.007112630208333333, "learning_rate": 0.0001, "loss": 4.2166, "loss/crossentropy": 2.101171374320984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22123181074857712, "step": 9080 }, { "epoch": 0.18164, "grad_norm": 2.1875, "grad_norm_var": 0.007225545247395834, "learning_rate": 0.0001, "loss": 4.2292, "loss/crossentropy": 2.09942090511322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21761803328990936, "step": 9082 }, { "epoch": 0.18168, "grad_norm": 2.1875, "grad_norm_var": 0.008072916666666667, "learning_rate": 0.0001, "loss": 4.4802, "loss/crossentropy": 2.4418424367904663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24908769130706787, "step": 9084 }, { "epoch": 0.18172, "grad_norm": 2.109375, "grad_norm_var": 0.0087890625, "learning_rate": 0.0001, "loss": 4.3408, "loss/crossentropy": 1.944950520992279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20515284687280655, "step": 9086 }, { "epoch": 0.18176, "grad_norm": 2.21875, "grad_norm_var": 0.007991536458333334, "learning_rate": 0.0001, "loss": 4.3117, "loss/crossentropy": 1.870418667793274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20427027344703674, "step": 9088 }, { "epoch": 0.1818, "grad_norm": 2.171875, "grad_norm_var": 0.007682291666666666, "learning_rate": 0.0001, "loss": 4.1323, "loss/crossentropy": 1.9338520169258118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19928501546382904, "step": 9090 }, { "epoch": 0.18184, "grad_norm": 1.9609375, "grad_norm_var": 0.008876291910807292, "learning_rate": 0.0001, "loss": 3.9734, "loss/crossentropy": 1.7826221585273743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19778436422348022, "step": 9092 }, { "epoch": 0.18188, "grad_norm": 2.078125, "grad_norm_var": 0.007458241780598959, "learning_rate": 0.0001, "loss": 4.3537, "loss/crossentropy": 2.2922680377960205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23613491654396057, "step": 9094 }, { "epoch": 0.18192, "grad_norm": 2.296875, "grad_norm_var": 0.15102513631184897, "learning_rate": 0.0001, "loss": 4.6789, "loss/crossentropy": 2.1802788972854614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969536542892456, "step": 9096 }, { "epoch": 0.18196, "grad_norm": 2.15625, "grad_norm_var": 0.1498308817545573, "learning_rate": 0.0001, "loss": 4.1715, "loss/crossentropy": 1.9129992723464966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1942882016301155, "step": 9098 }, { "epoch": 0.182, "grad_norm": 2.203125, "grad_norm_var": 0.14738337198893228, "learning_rate": 0.0001, "loss": 4.3374, "loss/crossentropy": 1.8288249969482422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879616051912308, "step": 9100 }, { "epoch": 0.18204, "grad_norm": 2.3125, "grad_norm_var": 0.14580663045247397, "learning_rate": 0.0001, "loss": 4.2824, "loss/crossentropy": 2.039812684059143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21374420076608658, "step": 9102 }, { "epoch": 0.18208, "grad_norm": 1.9609375, "grad_norm_var": 0.15449193318684895, "learning_rate": 0.0001, "loss": 4.1862, "loss/crossentropy": 2.1177414059638977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193453460931778, "step": 9104 }, { "epoch": 0.18212, "grad_norm": 2.28125, "grad_norm_var": 0.16704076131184895, "learning_rate": 0.0001, "loss": 4.4869, "loss/crossentropy": 2.171097159385681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21211445331573486, "step": 9106 }, { "epoch": 0.18216, "grad_norm": 2.359375, "grad_norm_var": 0.16413548787434895, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 1.9781638383865356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21307373046875, "step": 9108 }, { "epoch": 0.1822, "grad_norm": 1.984375, "grad_norm_var": 0.17157363891601562, "learning_rate": 0.0001, "loss": 4.2454, "loss/crossentropy": 2.0868560075759888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20633937418460846, "step": 9110 }, { "epoch": 0.18224, "grad_norm": 2.03125, "grad_norm_var": 0.04592463175455729, "learning_rate": 0.0001, "loss": 4.1153, "loss/crossentropy": 2.079172134399414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20709815621376038, "step": 9112 }, { "epoch": 0.18228, "grad_norm": 2.359375, "grad_norm_var": 0.04835383097330729, "learning_rate": 0.0001, "loss": 4.1836, "loss/crossentropy": 2.100913643836975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21831409633159637, "step": 9114 }, { "epoch": 0.18232, "grad_norm": 2.203125, "grad_norm_var": 0.049478912353515626, "learning_rate": 0.0001, "loss": 4.4087, "loss/crossentropy": 2.018395781517029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22545243054628372, "step": 9116 }, { "epoch": 0.18236, "grad_norm": 2.203125, "grad_norm_var": 0.04278132120768229, "learning_rate": 0.0001, "loss": 4.3421, "loss/crossentropy": 1.9177632331848145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23346271365880966, "step": 9118 }, { "epoch": 0.1824, "grad_norm": 2.0625, "grad_norm_var": 0.038358306884765624, "learning_rate": 0.0001, "loss": 3.9924, "loss/crossentropy": 2.1122357845306396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22593770176172256, "step": 9120 }, { "epoch": 0.18244, "grad_norm": 2.03125, "grad_norm_var": 0.014021555582682291, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.7920495867729187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127150148153305, "step": 9122 }, { "epoch": 0.18248, "grad_norm": 2.1875, "grad_norm_var": 0.0096099853515625, "learning_rate": 0.0001, "loss": 4.2061, "loss/crossentropy": 2.2180920839309692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25088224560022354, "step": 9124 }, { "epoch": 0.18252, "grad_norm": 2.1875, "grad_norm_var": 0.008185831705729167, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 1.8999969959259033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21616832166910172, "step": 9126 }, { "epoch": 0.18256, "grad_norm": 2.125, "grad_norm_var": 0.0081451416015625, "learning_rate": 0.0001, "loss": 4.5351, "loss/crossentropy": 1.9424527287483215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22016742825508118, "step": 9128 }, { "epoch": 0.1826, "grad_norm": 2.0625, "grad_norm_var": 0.0049468994140625, "learning_rate": 0.0001, "loss": 4.4432, "loss/crossentropy": 2.0539366006851196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26818516850471497, "step": 9130 }, { "epoch": 0.18264, "grad_norm": 2.0625, "grad_norm_var": 0.00458984375, "learning_rate": 0.0001, "loss": 4.1634, "loss/crossentropy": 1.9180519580841064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067112922668457, "step": 9132 }, { "epoch": 0.18268, "grad_norm": 2.078125, "grad_norm_var": 0.005052693684895833, "learning_rate": 0.0001, "loss": 3.9913, "loss/crossentropy": 1.7417545318603516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970825269818306, "step": 9134 }, { "epoch": 0.18272, "grad_norm": 2.09375, "grad_norm_var": 0.004833984375, "learning_rate": 0.0001, "loss": 4.016, "loss/crossentropy": 1.9498217701911926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21281737089157104, "step": 9136 }, { "epoch": 0.18276, "grad_norm": 2.21875, "grad_norm_var": 0.005615234375, "learning_rate": 0.0001, "loss": 4.3842, "loss/crossentropy": 2.140692949295044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24291902035474777, "step": 9138 }, { "epoch": 0.1828, "grad_norm": 1.984375, "grad_norm_var": 0.0069976806640625, "learning_rate": 0.0001, "loss": 4.1785, "loss/crossentropy": 2.3510342836380005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23326712846755981, "step": 9140 }, { "epoch": 0.18284, "grad_norm": 2.140625, "grad_norm_var": 0.0072743733723958336, "learning_rate": 0.0001, "loss": 4.0707, "loss/crossentropy": 2.049591898918152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21118033677339554, "step": 9142 }, { "epoch": 0.18288, "grad_norm": 2.109375, "grad_norm_var": 0.005631510416666667, "learning_rate": 0.0001, "loss": 4.263, "loss/crossentropy": 1.949703335762024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22161198407411575, "step": 9144 }, { "epoch": 0.18292, "grad_norm": 2.109375, "grad_norm_var": 0.005399576822916667, "learning_rate": 0.0001, "loss": 4.5602, "loss/crossentropy": 2.1442413330078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21281076222658157, "step": 9146 }, { "epoch": 0.18296, "grad_norm": 2.078125, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 4.0848, "loss/crossentropy": 2.103494882583618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20892268419265747, "step": 9148 }, { "epoch": 0.183, "grad_norm": 2.25, "grad_norm_var": 0.01060791015625, "learning_rate": 0.0001, "loss": 4.133, "loss/crossentropy": 2.1544495224952698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008143737912178, "step": 9150 }, { "epoch": 0.18304, "grad_norm": 1.984375, "grad_norm_var": 0.01109619140625, "learning_rate": 0.0001, "loss": 4.2799, "loss/crossentropy": 2.021821677684784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20975126326084137, "step": 9152 }, { "epoch": 0.18308, "grad_norm": 1.921875, "grad_norm_var": 0.0102203369140625, "learning_rate": 0.0001, "loss": 4.2858, "loss/crossentropy": 2.109215199947357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19855067878961563, "step": 9154 }, { "epoch": 0.18312, "grad_norm": 2.09375, "grad_norm_var": 0.011165364583333334, "learning_rate": 0.0001, "loss": 4.2181, "loss/crossentropy": 1.7631941437721252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20584283769130707, "step": 9156 }, { "epoch": 0.18316, "grad_norm": 2.171875, "grad_norm_var": 0.0116851806640625, "learning_rate": 0.0001, "loss": 4.4005, "loss/crossentropy": 2.131524443626404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151477411389351, "step": 9158 }, { "epoch": 0.1832, "grad_norm": 2.015625, "grad_norm_var": 0.01334228515625, "learning_rate": 0.0001, "loss": 4.2729, "loss/crossentropy": 2.018375277519226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21982873231172562, "step": 9160 }, { "epoch": 0.18324, "grad_norm": 2.09375, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 4.3302, "loss/crossentropy": 2.217617154121399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2257251739501953, "step": 9162 }, { "epoch": 0.18328, "grad_norm": 2.15625, "grad_norm_var": 0.014606730143229166, "learning_rate": 0.0001, "loss": 4.227, "loss/crossentropy": 1.8632460832595825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19728046655654907, "step": 9164 }, { "epoch": 0.18332, "grad_norm": 2.0625, "grad_norm_var": 0.012137858072916667, "learning_rate": 0.0001, "loss": 4.5335, "loss/crossentropy": 2.2818111181259155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22397568821907043, "step": 9166 }, { "epoch": 0.18336, "grad_norm": 2.109375, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 4.4104, "loss/crossentropy": 2.1209938526153564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490206360816956, "step": 9168 }, { "epoch": 0.1834, "grad_norm": 2.046875, "grad_norm_var": 0.00924072265625, "learning_rate": 0.0001, "loss": 4.244, "loss/crossentropy": 2.141623795032501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21710850298404694, "step": 9170 }, { "epoch": 0.18344, "grad_norm": 2.125, "grad_norm_var": 0.007323201497395833, "learning_rate": 0.0001, "loss": 4.2063, "loss/crossentropy": 2.165239691734314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.235738106071949, "step": 9172 }, { "epoch": 0.18348, "grad_norm": 2.015625, "grad_norm_var": 0.008103179931640624, "learning_rate": 0.0001, "loss": 4.0186, "loss/crossentropy": 1.8649475574493408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088451236486435, "step": 9174 }, { "epoch": 0.18352, "grad_norm": 2.15625, "grad_norm_var": 0.007012685139973958, "learning_rate": 0.0001, "loss": 4.2853, "loss/crossentropy": 2.312218189239502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.236283540725708, "step": 9176 }, { "epoch": 0.18356, "grad_norm": 2.109375, "grad_norm_var": 0.008699544270833333, "learning_rate": 0.0001, "loss": 3.9626, "loss/crossentropy": 2.0149282217025757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21434535831212997, "step": 9178 }, { "epoch": 0.1836, "grad_norm": 2.125, "grad_norm_var": 0.0086822509765625, "learning_rate": 0.0001, "loss": 4.5081, "loss/crossentropy": 2.499966621398926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26318275928497314, "step": 9180 }, { "epoch": 0.18364, "grad_norm": 2.140625, "grad_norm_var": 0.0084136962890625, "learning_rate": 0.0001, "loss": 4.4498, "loss/crossentropy": 2.0454984307289124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202616035938263, "step": 9182 }, { "epoch": 0.18368, "grad_norm": 1.9609375, "grad_norm_var": 0.010835520426432292, "learning_rate": 0.0001, "loss": 4.0754, "loss/crossentropy": 2.1708725094795227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20958629250526428, "step": 9184 }, { "epoch": 0.18372, "grad_norm": 2.140625, "grad_norm_var": 0.010792795817057292, "learning_rate": 0.0001, "loss": 4.2104, "loss/crossentropy": 1.8261350989341736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952563151717186, "step": 9186 }, { "epoch": 0.18376, "grad_norm": 2.046875, "grad_norm_var": 0.010009511311848959, "learning_rate": 0.0001, "loss": 4.1247, "loss/crossentropy": 2.036627769470215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22490298002958298, "step": 9188 }, { "epoch": 0.1838, "grad_norm": 1.9765625, "grad_norm_var": 0.010501861572265625, "learning_rate": 0.0001, "loss": 4.0088, "loss/crossentropy": 1.7977086305618286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19282807409763336, "step": 9190 }, { "epoch": 0.18384, "grad_norm": 2.21875, "grad_norm_var": 0.011211903889973958, "learning_rate": 0.0001, "loss": 4.1774, "loss/crossentropy": 2.170135021209717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23149186372756958, "step": 9192 }, { "epoch": 0.18388, "grad_norm": 2.328125, "grad_norm_var": 0.012035115559895834, "learning_rate": 0.0001, "loss": 4.4155, "loss/crossentropy": 2.1453020572662354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25120319426059723, "step": 9194 }, { "epoch": 0.18392, "grad_norm": 2.125, "grad_norm_var": 0.01224365234375, "learning_rate": 0.0001, "loss": 4.3628, "loss/crossentropy": 1.8794063925743103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20883548259735107, "step": 9196 }, { "epoch": 0.18396, "grad_norm": 2.3125, "grad_norm_var": 0.0141998291015625, "learning_rate": 0.0001, "loss": 4.3384, "loss/crossentropy": 2.021254241466522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21939975768327713, "step": 9198 }, { "epoch": 0.184, "grad_norm": 2.09375, "grad_norm_var": 0.009557851155598958, "learning_rate": 0.0001, "loss": 4.158, "loss/crossentropy": 2.195580303668976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108924463391304, "step": 9200 }, { "epoch": 0.18404, "grad_norm": 2.21875, "grad_norm_var": 0.014085896809895833, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 1.6260902881622314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20428159832954407, "step": 9202 }, { "epoch": 0.18408, "grad_norm": 1.96875, "grad_norm_var": 0.015363566080729167, "learning_rate": 0.0001, "loss": 4.2525, "loss/crossentropy": 2.138678550720215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21641074120998383, "step": 9204 }, { "epoch": 0.18412, "grad_norm": 2.125, "grad_norm_var": 0.01825129191080729, "learning_rate": 0.0001, "loss": 4.4706, "loss/crossentropy": 2.047194480895996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20891964435577393, "step": 9206 }, { "epoch": 0.18416, "grad_norm": 2.171875, "grad_norm_var": 0.019606272379557293, "learning_rate": 0.0001, "loss": 4.2312, "loss/crossentropy": 2.189277768135071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150397077202797, "step": 9208 }, { "epoch": 0.1842, "grad_norm": 2.140625, "grad_norm_var": 0.017618560791015626, "learning_rate": 0.0001, "loss": 4.2792, "loss/crossentropy": 2.184122920036316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23278063535690308, "step": 9210 }, { "epoch": 0.18424, "grad_norm": 1.9921875, "grad_norm_var": 0.019139607747395832, "learning_rate": 0.0001, "loss": 4.3471, "loss/crossentropy": 2.413718104362488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24333150684833527, "step": 9212 }, { "epoch": 0.18428, "grad_norm": 2.078125, "grad_norm_var": 0.019017537434895832, "learning_rate": 0.0001, "loss": 3.8486, "loss/crossentropy": 1.8086814880371094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19976364076137543, "step": 9214 }, { "epoch": 0.18432, "grad_norm": 2.09375, "grad_norm_var": 0.019978841145833332, "learning_rate": 0.0001, "loss": 4.5222, "loss/crossentropy": 2.2418206930160522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229690782725811, "step": 9216 }, { "epoch": 0.18436, "grad_norm": 2.015625, "grad_norm_var": 0.016355133056640624, "learning_rate": 0.0001, "loss": 4.0817, "loss/crossentropy": 1.8083258867263794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20999443531036377, "step": 9218 }, { "epoch": 0.1844, "grad_norm": 1.953125, "grad_norm_var": 0.01693115234375, "learning_rate": 0.0001, "loss": 3.772, "loss/crossentropy": 1.8117709755897522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20436270534992218, "step": 9220 }, { "epoch": 0.18444, "grad_norm": 2.15625, "grad_norm_var": 0.010550944010416667, "learning_rate": 0.0001, "loss": 4.2149, "loss/crossentropy": 2.024270534515381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20097267627716064, "step": 9222 }, { "epoch": 0.18448, "grad_norm": 2.25, "grad_norm_var": 0.011449178059895834, "learning_rate": 0.0001, "loss": 4.4756, "loss/crossentropy": 2.2385981678962708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21131044626235962, "step": 9224 }, { "epoch": 0.18452, "grad_norm": 2.140625, "grad_norm_var": 0.011271158854166666, "learning_rate": 0.0001, "loss": 4.3745, "loss/crossentropy": 2.127749502658844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22855369001626968, "step": 9226 }, { "epoch": 0.18456, "grad_norm": 2.15625, "grad_norm_var": 0.011237589518229167, "learning_rate": 0.0001, "loss": 4.2645, "loss/crossentropy": 2.1877033710479736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2339663878083229, "step": 9228 }, { "epoch": 0.1846, "grad_norm": 2.25, "grad_norm_var": 0.011139933268229167, "learning_rate": 0.0001, "loss": 4.2642, "loss/crossentropy": 1.931507408618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20912021398544312, "step": 9230 }, { "epoch": 0.18464, "grad_norm": 2.234375, "grad_norm_var": 0.011579386393229167, "learning_rate": 0.0001, "loss": 4.3309, "loss/crossentropy": 1.8101251125335693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23724676668643951, "step": 9232 }, { "epoch": 0.18468, "grad_norm": 2.125, "grad_norm_var": 0.0111480712890625, "learning_rate": 0.0001, "loss": 4.201, "loss/crossentropy": 2.015208065509796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23482007533311844, "step": 9234 }, { "epoch": 0.18472, "grad_norm": 2.1875, "grad_norm_var": 0.010282135009765625, "learning_rate": 0.0001, "loss": 4.5357, "loss/crossentropy": 2.2742738723754883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23632052540779114, "step": 9236 }, { "epoch": 0.18476, "grad_norm": 2.46875, "grad_norm_var": 0.016806793212890626, "learning_rate": 0.0001, "loss": 4.3574, "loss/crossentropy": 1.7254774570465088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104022428393364, "step": 9238 }, { "epoch": 0.1848, "grad_norm": 2.09375, "grad_norm_var": 0.016585032145182293, "learning_rate": 0.0001, "loss": 4.3477, "loss/crossentropy": 2.181770443916321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22411521524190903, "step": 9240 }, { "epoch": 0.18484, "grad_norm": 1.953125, "grad_norm_var": 0.020157877604166666, "learning_rate": 0.0001, "loss": 3.9517, "loss/crossentropy": 1.8449691534042358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091139778494835, "step": 9242 }, { "epoch": 0.18488, "grad_norm": 2.1875, "grad_norm_var": 0.017195383707682293, "learning_rate": 0.0001, "loss": 4.3389, "loss/crossentropy": 2.0917609333992004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2644127458333969, "step": 9244 }, { "epoch": 0.18492, "grad_norm": 2.28125, "grad_norm_var": 0.01793390909830729, "learning_rate": 0.0001, "loss": 4.083, "loss/crossentropy": 2.060012102127075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21294714510440826, "step": 9246 }, { "epoch": 0.18496, "grad_norm": 2.078125, "grad_norm_var": 0.018387603759765624, "learning_rate": 0.0001, "loss": 4.6598, "loss/crossentropy": 2.059940278530121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22648683190345764, "step": 9248 }, { "epoch": 0.185, "grad_norm": 2.078125, "grad_norm_var": 0.01862360636393229, "learning_rate": 0.0001, "loss": 4.4696, "loss/crossentropy": 1.8423291444778442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22509171068668365, "step": 9250 }, { "epoch": 0.18504, "grad_norm": 2.125, "grad_norm_var": 0.016410064697265626, "learning_rate": 0.0001, "loss": 4.4629, "loss/crossentropy": 2.2559698820114136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23896963894367218, "step": 9252 }, { "epoch": 0.18508, "grad_norm": 2.03125, "grad_norm_var": 0.01579767862955729, "learning_rate": 0.0001, "loss": 4.6027, "loss/crossentropy": 2.0583658814430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22763275355100632, "step": 9254 }, { "epoch": 0.18512, "grad_norm": 2.015625, "grad_norm_var": 0.016658274332682292, "learning_rate": 0.0001, "loss": 4.2654, "loss/crossentropy": 1.816649854183197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18608735501766205, "step": 9256 }, { "epoch": 0.18516, "grad_norm": 4.65625, "grad_norm_var": 0.4073150634765625, "learning_rate": 0.0001, "loss": 4.2837, "loss/crossentropy": 2.0920958518981934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22859029471874237, "step": 9258 }, { "epoch": 0.1852, "grad_norm": 2.125, "grad_norm_var": 0.4108306884765625, "learning_rate": 0.0001, "loss": 4.2517, "loss/crossentropy": 2.1475982666015625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21678777784109116, "step": 9260 }, { "epoch": 0.18524, "grad_norm": 2.21875, "grad_norm_var": 0.4100901285807292, "learning_rate": 0.0001, "loss": 4.4258, "loss/crossentropy": 2.22346031665802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333592176437378, "step": 9262 }, { "epoch": 0.18528, "grad_norm": 2.09375, "grad_norm_var": 0.41646703084309894, "learning_rate": 0.0001, "loss": 4.2134, "loss/crossentropy": 1.7652028799057007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18461769074201584, "step": 9264 }, { "epoch": 0.18532, "grad_norm": 2.109375, "grad_norm_var": 0.4225006103515625, "learning_rate": 0.0001, "loss": 4.2362, "loss/crossentropy": 2.1549625396728516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22428707033395767, "step": 9266 }, { "epoch": 0.18536, "grad_norm": 2.0, "grad_norm_var": 0.4281972249348958, "learning_rate": 0.0001, "loss": 4.3685, "loss/crossentropy": 2.1677842140197754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22887174785137177, "step": 9268 }, { "epoch": 0.1854, "grad_norm": 2.09375, "grad_norm_var": 0.42428792317708336, "learning_rate": 0.0001, "loss": 4.1517, "loss/crossentropy": 1.8352131247520447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19628287106752396, "step": 9270 }, { "epoch": 0.18544, "grad_norm": 2.109375, "grad_norm_var": 0.43038304646809894, "learning_rate": 0.0001, "loss": 4.101, "loss/crossentropy": 2.104279100894928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20481227338314056, "step": 9272 }, { "epoch": 0.18548, "grad_norm": 2.21875, "grad_norm_var": 0.009795888264973959, "learning_rate": 0.0001, "loss": 4.2011, "loss/crossentropy": 1.8447301387786865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2227916270494461, "step": 9274 }, { "epoch": 0.18552, "grad_norm": 2.296875, "grad_norm_var": 0.013651275634765625, "learning_rate": 0.0001, "loss": 4.3544, "loss/crossentropy": 2.303207755088806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24895529448986053, "step": 9276 }, { "epoch": 0.18556, "grad_norm": 2.171875, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 4.699, "loss/crossentropy": 2.248077630996704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23077847063541412, "step": 9278 }, { "epoch": 0.1856, "grad_norm": 2.34375, "grad_norm_var": 0.017438761393229165, "learning_rate": 0.0001, "loss": 4.4943, "loss/crossentropy": 2.261389970779419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23669905960559845, "step": 9280 }, { "epoch": 0.18564, "grad_norm": 2.234375, "grad_norm_var": 0.016056060791015625, "learning_rate": 0.0001, "loss": 4.3805, "loss/crossentropy": 2.402338147163391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2453952580690384, "step": 9282 }, { "epoch": 0.18568, "grad_norm": 2.265625, "grad_norm_var": 0.01587702433268229, "learning_rate": 0.0001, "loss": 4.2253, "loss/crossentropy": 2.011174201965332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22673364728689194, "step": 9284 }, { "epoch": 0.18572, "grad_norm": 2.234375, "grad_norm_var": 0.016810862223307292, "learning_rate": 0.0001, "loss": 4.3897, "loss/crossentropy": 1.9700093269348145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22711393237113953, "step": 9286 }, { "epoch": 0.18576, "grad_norm": 2.140625, "grad_norm_var": 0.014922841389973959, "learning_rate": 0.0001, "loss": 3.9609, "loss/crossentropy": 2.0253939032554626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21902555227279663, "step": 9288 }, { "epoch": 0.1858, "grad_norm": 2.296875, "grad_norm_var": 0.016961415608723957, "learning_rate": 0.0001, "loss": 4.1799, "loss/crossentropy": 1.741984784603119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1879509538412094, "step": 9290 }, { "epoch": 0.18584, "grad_norm": 2.296875, "grad_norm_var": 0.022564442952473958, "learning_rate": 0.0001, "loss": 4.7983, "loss/crossentropy": 2.3156943321228027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2687100023031235, "step": 9292 }, { "epoch": 0.18588, "grad_norm": 2.015625, "grad_norm_var": 0.025833892822265624, "learning_rate": 0.0001, "loss": 4.2894, "loss/crossentropy": 2.0826371908187866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21234872937202454, "step": 9294 }, { "epoch": 0.18592, "grad_norm": 2.078125, "grad_norm_var": 0.025921376546223958, "learning_rate": 0.0001, "loss": 4.6473, "loss/crossentropy": 2.4080610275268555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2670409381389618, "step": 9296 }, { "epoch": 0.18596, "grad_norm": 2.046875, "grad_norm_var": 0.02958958943684896, "learning_rate": 0.0001, "loss": 4.1997, "loss/crossentropy": 1.722270905971527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20094333589076996, "step": 9298 }, { "epoch": 0.186, "grad_norm": 2.03125, "grad_norm_var": 0.028527577718098957, "learning_rate": 0.0001, "loss": 4.3247, "loss/crossentropy": 2.0514711141586304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209539495408535, "step": 9300 }, { "epoch": 0.18604, "grad_norm": 2.640625, "grad_norm_var": 2.5507850646972656, "learning_rate": 0.0001, "loss": 4.9819, "loss/crossentropy": 2.4808409214019775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2367517650127411, "step": 9302 }, { "epoch": 0.18608, "grad_norm": 2.078125, "grad_norm_var": 2.536018880208333, "learning_rate": 0.0001, "loss": 4.1105, "loss/crossentropy": 2.2539944648742676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22834083437919617, "step": 9304 }, { "epoch": 0.18612, "grad_norm": 2.078125, "grad_norm_var": 2.5449544270833333, "learning_rate": 0.0001, "loss": 4.2383, "loss/crossentropy": 1.9335210919380188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20264607667922974, "step": 9306 }, { "epoch": 0.18616, "grad_norm": 2.109375, "grad_norm_var": 2.5707194010416665, "learning_rate": 0.0001, "loss": 4.2285, "loss/crossentropy": 1.90863037109375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20882528275251389, "step": 9308 }, { "epoch": 0.1862, "grad_norm": 2.1875, "grad_norm_var": 2.5589996337890626, "learning_rate": 0.0001, "loss": 4.2195, "loss/crossentropy": 2.1449084281921387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2329169511795044, "step": 9310 }, { "epoch": 0.18624, "grad_norm": 2.015625, "grad_norm_var": 2.567341105143229, "learning_rate": 0.0001, "loss": 4.1806, "loss/crossentropy": 1.9571366906166077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148708775639534, "step": 9312 }, { "epoch": 0.18628, "grad_norm": 2.25, "grad_norm_var": 2.542252604166667, "learning_rate": 0.0001, "loss": 4.4691, "loss/crossentropy": 2.174731135368347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280728593468666, "step": 9314 }, { "epoch": 0.18632, "grad_norm": 1.96875, "grad_norm_var": 2.5416575113932294, "learning_rate": 0.0001, "loss": 4.4344, "loss/crossentropy": 1.9569833874702454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855706185102463, "step": 9316 }, { "epoch": 0.18636, "grad_norm": 2.1875, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 4.3162, "loss/crossentropy": 2.153563976287842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22807130962610245, "step": 9318 }, { "epoch": 0.1864, "grad_norm": 2.203125, "grad_norm_var": 0.010741933186848959, "learning_rate": 0.0001, "loss": 4.3028, "loss/crossentropy": 1.9219058752059937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295902967453003, "step": 9320 }, { "epoch": 0.18644, "grad_norm": 1.96875, "grad_norm_var": 0.011926015218098959, "learning_rate": 0.0001, "loss": 4.29, "loss/crossentropy": 2.1993675231933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223759263753891, "step": 9322 }, { "epoch": 0.18648, "grad_norm": 2.078125, "grad_norm_var": 0.011730702718098958, "learning_rate": 0.0001, "loss": 4.2254, "loss/crossentropy": 2.173740863800049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23405525833368301, "step": 9324 }, { "epoch": 0.18652, "grad_norm": 2.25, "grad_norm_var": 0.012564849853515626, "learning_rate": 0.0001, "loss": 4.3717, "loss/crossentropy": 2.026577115058899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265823632478714, "step": 9326 }, { "epoch": 0.18656, "grad_norm": 2.671875, "grad_norm_var": 0.030326080322265626, "learning_rate": 0.0001, "loss": 4.4664, "loss/crossentropy": 2.3629637956619263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23659023642539978, "step": 9328 }, { "epoch": 0.1866, "grad_norm": 2.234375, "grad_norm_var": 0.03050715128580729, "learning_rate": 0.0001, "loss": 4.3245, "loss/crossentropy": 2.100727915763855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21468425542116165, "step": 9330 }, { "epoch": 0.18664, "grad_norm": 2.109375, "grad_norm_var": 0.025923411051432293, "learning_rate": 0.0001, "loss": 4.5291, "loss/crossentropy": 2.163568615913391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23402437567710876, "step": 9332 }, { "epoch": 0.18668, "grad_norm": 1.9375, "grad_norm_var": 0.02934748331705729, "learning_rate": 0.0001, "loss": 3.9687, "loss/crossentropy": 1.9579994082450867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19713342934846878, "step": 9334 }, { "epoch": 0.18672, "grad_norm": 1.9609375, "grad_norm_var": 0.029412587483723957, "learning_rate": 0.0001, "loss": 4.2586, "loss/crossentropy": 1.805375874042511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22647518664598465, "step": 9336 }, { "epoch": 0.18676, "grad_norm": 2.171875, "grad_norm_var": 0.02797215779622396, "learning_rate": 0.0001, "loss": 4.2025, "loss/crossentropy": 2.0764458775520325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2270444855093956, "step": 9338 }, { "epoch": 0.1868, "grad_norm": 2.03125, "grad_norm_var": 0.030987294514973958, "learning_rate": 0.0001, "loss": 3.8731, "loss/crossentropy": 1.5654467940330505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15944529324769974, "step": 9340 }, { "epoch": 0.18684, "grad_norm": 2.15625, "grad_norm_var": 0.02976048787434896, "learning_rate": 0.0001, "loss": 4.3831, "loss/crossentropy": 2.007612407207489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061111181974411, "step": 9342 }, { "epoch": 0.18688, "grad_norm": 1.875, "grad_norm_var": 0.010910797119140624, "learning_rate": 0.0001, "loss": 4.0835, "loss/crossentropy": 1.981432855129242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19339826703071594, "step": 9344 }, { "epoch": 0.18692, "grad_norm": 1.9921875, "grad_norm_var": 0.00965576171875, "learning_rate": 0.0001, "loss": 4.1291, "loss/crossentropy": 1.804275631904602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968545839190483, "step": 9346 }, { "epoch": 0.18696, "grad_norm": 2.078125, "grad_norm_var": 0.008763631184895834, "learning_rate": 0.0001, "loss": 4.2109, "loss/crossentropy": 1.9718617796897888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21482623368501663, "step": 9348 }, { "epoch": 0.187, "grad_norm": 2.15625, "grad_norm_var": 0.008397420247395834, "learning_rate": 0.0001, "loss": 4.1672, "loss/crossentropy": 1.8358338475227356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20042669028043747, "step": 9350 }, { "epoch": 0.18704, "grad_norm": 2.15625, "grad_norm_var": 0.009089914957682292, "learning_rate": 0.0001, "loss": 4.2045, "loss/crossentropy": 1.9447709321975708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21207460761070251, "step": 9352 }, { "epoch": 0.18708, "grad_norm": 2.03125, "grad_norm_var": 0.011356353759765625, "learning_rate": 0.0001, "loss": 4.3134, "loss/crossentropy": 1.8921163082122803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19945629686117172, "step": 9354 }, { "epoch": 0.18712, "grad_norm": 2.171875, "grad_norm_var": 0.009580230712890625, "learning_rate": 0.0001, "loss": 4.2018, "loss/crossentropy": 2.1323947310447693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2258753925561905, "step": 9356 }, { "epoch": 0.18716, "grad_norm": 2.109375, "grad_norm_var": 0.009277089436848959, "learning_rate": 0.0001, "loss": 4.4278, "loss/crossentropy": 2.064914345741272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23450962454080582, "step": 9358 }, { "epoch": 0.1872, "grad_norm": 2.046875, "grad_norm_var": 0.005783843994140625, "learning_rate": 0.0001, "loss": 4.0173, "loss/crossentropy": 2.1264703273773193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169734537601471, "step": 9360 }, { "epoch": 0.18724, "grad_norm": 2.09375, "grad_norm_var": 0.004784138997395834, "learning_rate": 0.0001, "loss": 4.2926, "loss/crossentropy": 1.7044150233268738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19169463962316513, "step": 9362 }, { "epoch": 0.18728, "grad_norm": 2.0625, "grad_norm_var": 0.004541015625, "learning_rate": 0.0001, "loss": 4.5367, "loss/crossentropy": 1.9475398659706116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22891414165496826, "step": 9364 }, { "epoch": 0.18732, "grad_norm": 1.9140625, "grad_norm_var": 0.008017730712890626, "learning_rate": 0.0001, "loss": 4.1416, "loss/crossentropy": 2.134114623069763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2241017445921898, "step": 9366 }, { "epoch": 0.18736, "grad_norm": 2.1875, "grad_norm_var": 0.007342274983723958, "learning_rate": 0.0001, "loss": 4.2185, "loss/crossentropy": 1.8317620158195496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20189791917800903, "step": 9368 }, { "epoch": 0.1874, "grad_norm": 2.140625, "grad_norm_var": 0.005041249593098958, "learning_rate": 0.0001, "loss": 4.3518, "loss/crossentropy": 2.257538855075836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2343401461839676, "step": 9370 }, { "epoch": 0.18744, "grad_norm": 2.09375, "grad_norm_var": 0.005228424072265625, "learning_rate": 0.0001, "loss": 4.0535, "loss/crossentropy": 2.3722634315490723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23045828938484192, "step": 9372 }, { "epoch": 0.18748, "grad_norm": 2.078125, "grad_norm_var": 0.005222320556640625, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.1880545020103455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2345728725194931, "step": 9374 }, { "epoch": 0.18752, "grad_norm": 2.3125, "grad_norm_var": 0.008255767822265624, "learning_rate": 0.0001, "loss": 4.5037, "loss/crossentropy": 1.8883287906646729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25232937932014465, "step": 9376 }, { "epoch": 0.18756, "grad_norm": 2.21875, "grad_norm_var": 0.009124501546223959, "learning_rate": 0.0001, "loss": 4.2963, "loss/crossentropy": 1.9253730773925781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20324261486530304, "step": 9378 }, { "epoch": 0.1876, "grad_norm": 2.078125, "grad_norm_var": 0.009211985270182292, "learning_rate": 0.0001, "loss": 4.2745, "loss/crossentropy": 1.961540937423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22623597085475922, "step": 9380 }, { "epoch": 0.18764, "grad_norm": 2.203125, "grad_norm_var": 0.007835896809895833, "learning_rate": 0.0001, "loss": 4.347, "loss/crossentropy": 2.190120279788971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24154195934534073, "step": 9382 }, { "epoch": 0.18768, "grad_norm": 2.140625, "grad_norm_var": 0.007445271809895833, "learning_rate": 0.0001, "loss": 4.6159, "loss/crossentropy": 2.0888350009918213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22750811278820038, "step": 9384 }, { "epoch": 0.18772, "grad_norm": 2.1875, "grad_norm_var": 0.008128865559895834, "learning_rate": 0.0001, "loss": 4.3171, "loss/crossentropy": 1.724816918373108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21245518326759338, "step": 9386 }, { "epoch": 0.18776, "grad_norm": 2.0, "grad_norm_var": 0.009307607014973959, "learning_rate": 0.0001, "loss": 4.0868, "loss/crossentropy": 1.6542762517929077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1961463838815689, "step": 9388 }, { "epoch": 0.1878, "grad_norm": 2.078125, "grad_norm_var": 0.009714508056640625, "learning_rate": 0.0001, "loss": 4.4748, "loss/crossentropy": 2.3528761863708496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23583710938692093, "step": 9390 }, { "epoch": 0.18784, "grad_norm": 2.03125, "grad_norm_var": 0.009012603759765625, "learning_rate": 0.0001, "loss": 4.394, "loss/crossentropy": 2.181188702583313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24201467633247375, "step": 9392 }, { "epoch": 0.18788, "grad_norm": 2.0625, "grad_norm_var": 0.008414459228515626, "learning_rate": 0.0001, "loss": 4.3476, "loss/crossentropy": 2.280683398246765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22065220028162003, "step": 9394 }, { "epoch": 0.18792, "grad_norm": 2.21875, "grad_norm_var": 0.009275054931640625, "learning_rate": 0.0001, "loss": 4.1603, "loss/crossentropy": 1.9839438199996948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22708184272050858, "step": 9396 }, { "epoch": 0.18796, "grad_norm": 2.125, "grad_norm_var": 0.0070879618326822914, "learning_rate": 0.0001, "loss": 4.4322, "loss/crossentropy": 2.311874270439148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22249652445316315, "step": 9398 }, { "epoch": 0.188, "grad_norm": 2.140625, "grad_norm_var": 0.008503214518229166, "learning_rate": 0.0001, "loss": 4.3808, "loss/crossentropy": 2.430112838745117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23544297367334366, "step": 9400 }, { "epoch": 0.18804, "grad_norm": 1.96875, "grad_norm_var": 0.00892333984375, "learning_rate": 0.0001, "loss": 4.2393, "loss/crossentropy": 2.146397888660431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23211465775966644, "step": 9402 }, { "epoch": 0.18808, "grad_norm": 2.203125, "grad_norm_var": 0.008318837483723958, "learning_rate": 0.0001, "loss": 4.5071, "loss/crossentropy": 2.4402170181274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25594406574964523, "step": 9404 }, { "epoch": 0.18812, "grad_norm": 2.046875, "grad_norm_var": 0.008294423421223959, "learning_rate": 0.0001, "loss": 4.3848, "loss/crossentropy": 2.141621232032776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156461626291275, "step": 9406 }, { "epoch": 0.18816, "grad_norm": 2.0625, "grad_norm_var": 0.006290435791015625, "learning_rate": 0.0001, "loss": 4.2511, "loss/crossentropy": 2.0266456604003906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20194754749536514, "step": 9408 }, { "epoch": 0.1882, "grad_norm": 1.9765625, "grad_norm_var": 0.007062784830729167, "learning_rate": 0.0001, "loss": 4.2831, "loss/crossentropy": 2.304496645927429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22345459461212158, "step": 9410 }, { "epoch": 0.18824, "grad_norm": 2.25, "grad_norm_var": 0.007523600260416667, "learning_rate": 0.0001, "loss": 4.4085, "loss/crossentropy": 1.8486470580101013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20493299514055252, "step": 9412 }, { "epoch": 0.18828, "grad_norm": 2.09375, "grad_norm_var": 0.0075927734375, "learning_rate": 0.0001, "loss": 4.4562, "loss/crossentropy": 2.0593990683555603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22942744940519333, "step": 9414 }, { "epoch": 0.18832, "grad_norm": 2.15625, "grad_norm_var": 0.006528472900390625, "learning_rate": 0.0001, "loss": 4.1551, "loss/crossentropy": 1.7936111688613892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1776513010263443, "step": 9416 }, { "epoch": 0.18836, "grad_norm": 2.0, "grad_norm_var": 0.005997467041015625, "learning_rate": 0.0001, "loss": 4.4484, "loss/crossentropy": 2.0676616430282593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21748381853103638, "step": 9418 }, { "epoch": 0.1884, "grad_norm": 2.1875, "grad_norm_var": 0.005236562093098958, "learning_rate": 0.0001, "loss": 4.3415, "loss/crossentropy": 2.0317665934562683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215682566165924, "step": 9420 }, { "epoch": 0.18844, "grad_norm": 2.234375, "grad_norm_var": 0.006359608968098959, "learning_rate": 0.0001, "loss": 4.5623, "loss/crossentropy": 2.3345483541488647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24850556254386902, "step": 9422 }, { "epoch": 0.18848, "grad_norm": 2.03125, "grad_norm_var": 0.007043202718098958, "learning_rate": 0.0001, "loss": 4.3526, "loss/crossentropy": 2.0603779554367065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20997442305088043, "step": 9424 }, { "epoch": 0.18852, "grad_norm": 2.15625, "grad_norm_var": 0.006591796875, "learning_rate": 0.0001, "loss": 4.4944, "loss/crossentropy": 1.9450209140777588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21465806663036346, "step": 9426 }, { "epoch": 0.18856, "grad_norm": 2.046875, "grad_norm_var": 0.0065266927083333336, "learning_rate": 0.0001, "loss": 4.2452, "loss/crossentropy": 2.3568087816238403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24619507789611816, "step": 9428 }, { "epoch": 0.1886, "grad_norm": 2.25, "grad_norm_var": 0.007828776041666667, "learning_rate": 0.0001, "loss": 4.1976, "loss/crossentropy": 2.2047020196914673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2519628629088402, "step": 9430 }, { "epoch": 0.18864, "grad_norm": 1.9609375, "grad_norm_var": 0.010188547770182292, "learning_rate": 0.0001, "loss": 4.3168, "loss/crossentropy": 2.1594117879867554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21274243295192719, "step": 9432 }, { "epoch": 0.18868, "grad_norm": 2.25, "grad_norm_var": 0.009936269124348958, "learning_rate": 0.0001, "loss": 4.3847, "loss/crossentropy": 2.120336890220642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2092270776629448, "step": 9434 }, { "epoch": 0.18872, "grad_norm": 2.015625, "grad_norm_var": 0.011120351155598958, "learning_rate": 0.0001, "loss": 4.2725, "loss/crossentropy": 2.13198459148407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.238224595785141, "step": 9436 }, { "epoch": 0.18876, "grad_norm": 2.09375, "grad_norm_var": 0.010009511311848959, "learning_rate": 0.0001, "loss": 4.3706, "loss/crossentropy": 1.9252395629882812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22254322469234467, "step": 9438 }, { "epoch": 0.1888, "grad_norm": 2.09375, "grad_norm_var": 0.0123291015625, "learning_rate": 0.0001, "loss": 4.0574, "loss/crossentropy": 2.123266577720642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22276867926120758, "step": 9440 }, { "epoch": 0.18884, "grad_norm": 2.140625, "grad_norm_var": 0.010741170247395833, "learning_rate": 0.0001, "loss": 4.2195, "loss/crossentropy": 1.9219747185707092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18884174525737762, "step": 9442 }, { "epoch": 0.18888, "grad_norm": 2.125, "grad_norm_var": 0.010587565104166667, "learning_rate": 0.0001, "loss": 4.2603, "loss/crossentropy": 2.0122207403182983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20913395285606384, "step": 9444 }, { "epoch": 0.18892, "grad_norm": 1.953125, "grad_norm_var": 0.010632069905598958, "learning_rate": 0.0001, "loss": 4.0871, "loss/crossentropy": 2.0255361199378967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982431635260582, "step": 9446 }, { "epoch": 0.18896, "grad_norm": 1.9921875, "grad_norm_var": 0.008770497639973958, "learning_rate": 0.0001, "loss": 4.4085, "loss/crossentropy": 1.8060500025749207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19787351042032242, "step": 9448 }, { "epoch": 0.189, "grad_norm": 2.171875, "grad_norm_var": 0.007389068603515625, "learning_rate": 0.0001, "loss": 4.3799, "loss/crossentropy": 2.340656042098999, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22321298718452454, "step": 9450 }, { "epoch": 0.18904, "grad_norm": 1.9296875, "grad_norm_var": 0.00966796875, "learning_rate": 0.0001, "loss": 4.3973, "loss/crossentropy": 2.35786235332489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356308028101921, "step": 9452 }, { "epoch": 0.18908, "grad_norm": 2.0625, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.2622, "loss/crossentropy": 2.344806671142578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21194154769182205, "step": 9454 }, { "epoch": 0.18912, "grad_norm": 2.046875, "grad_norm_var": 0.008786773681640625, "learning_rate": 0.0001, "loss": 4.4861, "loss/crossentropy": 2.2449493408203125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21038557589054108, "step": 9456 }, { "epoch": 0.18916, "grad_norm": 1.8515625, "grad_norm_var": 0.014518229166666667, "learning_rate": 0.0001, "loss": 4.0343, "loss/crossentropy": 1.977162778377533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20586547255516052, "step": 9458 }, { "epoch": 0.1892, "grad_norm": 2.203125, "grad_norm_var": 0.015860748291015626, "learning_rate": 0.0001, "loss": 4.3453, "loss/crossentropy": 2.0941065549850464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2197253629565239, "step": 9460 }, { "epoch": 0.18924, "grad_norm": 2.03125, "grad_norm_var": 0.0143463134765625, "learning_rate": 0.0001, "loss": 4.3479, "loss/crossentropy": 2.5145565271377563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25138507783412933, "step": 9462 }, { "epoch": 0.18928, "grad_norm": 2.0, "grad_norm_var": 0.014289347330729167, "learning_rate": 0.0001, "loss": 4.4076, "loss/crossentropy": 2.2870718240737915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2298167496919632, "step": 9464 }, { "epoch": 0.18932, "grad_norm": 2.109375, "grad_norm_var": 0.013678995768229167, "learning_rate": 0.0001, "loss": 4.3588, "loss/crossentropy": 2.2095978260040283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217079259455204, "step": 9466 }, { "epoch": 0.18936, "grad_norm": 2.140625, "grad_norm_var": 0.011726633707682291, "learning_rate": 0.0001, "loss": 4.3735, "loss/crossentropy": 1.9591819047927856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21825183182954788, "step": 9468 }, { "epoch": 0.1894, "grad_norm": 2.1875, "grad_norm_var": 0.011352284749348959, "learning_rate": 0.0001, "loss": 4.5072, "loss/crossentropy": 2.266845226287842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24172072112560272, "step": 9470 }, { "epoch": 0.18944, "grad_norm": 2.203125, "grad_norm_var": 0.011437733968098959, "learning_rate": 0.0001, "loss": 4.3448, "loss/crossentropy": 2.0732688903808594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23074156790971756, "step": 9472 }, { "epoch": 0.18948, "grad_norm": 2.34375, "grad_norm_var": 0.0090728759765625, "learning_rate": 0.0001, "loss": 4.6697, "loss/crossentropy": 1.935340702533722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194545865058899, "step": 9474 }, { "epoch": 0.18952, "grad_norm": 2.125, "grad_norm_var": 0.007594553629557291, "learning_rate": 0.0001, "loss": 4.1518, "loss/crossentropy": 1.795669674873352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18794939666986465, "step": 9476 }, { "epoch": 0.18956, "grad_norm": 2.109375, "grad_norm_var": 0.008841705322265626, "learning_rate": 0.0001, "loss": 3.7745, "loss/crossentropy": 1.9292446970939636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20147182047367096, "step": 9478 }, { "epoch": 0.1896, "grad_norm": 2.09375, "grad_norm_var": 0.0067779541015625, "learning_rate": 0.0001, "loss": 4.1955, "loss/crossentropy": 2.0110061168670654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096811756491661, "step": 9480 }, { "epoch": 0.18964, "grad_norm": 1.9921875, "grad_norm_var": 0.008034006754557291, "learning_rate": 0.0001, "loss": 4.2937, "loss/crossentropy": 2.1543468236923218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142205834388733, "step": 9482 }, { "epoch": 0.18968, "grad_norm": 2.046875, "grad_norm_var": 0.009723917643229166, "learning_rate": 0.0001, "loss": 4.0126, "loss/crossentropy": 2.0860520601272583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21716032177209854, "step": 9484 }, { "epoch": 0.18972, "grad_norm": 1.9921875, "grad_norm_var": 0.010241444905598958, "learning_rate": 0.0001, "loss": 3.9412, "loss/crossentropy": 1.7190409302711487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19397014379501343, "step": 9486 }, { "epoch": 0.18976, "grad_norm": 2.234375, "grad_norm_var": 0.017116038004557292, "learning_rate": 0.0001, "loss": 4.4711, "loss/crossentropy": 2.178081512451172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280048429965973, "step": 9488 }, { "epoch": 0.1898, "grad_norm": 2.109375, "grad_norm_var": 0.013063303629557292, "learning_rate": 0.0001, "loss": 4.267, "loss/crossentropy": 2.0749863982200623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22755203396081924, "step": 9490 }, { "epoch": 0.18984, "grad_norm": 2.09375, "grad_norm_var": 0.014098866780598959, "learning_rate": 0.0001, "loss": 4.3147, "loss/crossentropy": 2.214204430580139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23071825504302979, "step": 9492 }, { "epoch": 0.18988, "grad_norm": 2.09375, "grad_norm_var": 0.015794881184895835, "learning_rate": 0.0001, "loss": 4.0041, "loss/crossentropy": 1.6625414490699768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1857389286160469, "step": 9494 }, { "epoch": 0.18992, "grad_norm": 2.375, "grad_norm_var": 0.020539347330729166, "learning_rate": 0.0001, "loss": 4.4474, "loss/crossentropy": 1.8537201285362244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21718977391719818, "step": 9496 }, { "epoch": 0.18996, "grad_norm": 1.84375, "grad_norm_var": 0.024074045817057292, "learning_rate": 0.0001, "loss": 4.0488, "loss/crossentropy": 1.716725468635559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18544895946979523, "step": 9498 }, { "epoch": 0.19, "grad_norm": 1.9453125, "grad_norm_var": 0.024192047119140626, "learning_rate": 0.0001, "loss": 4.1035, "loss/crossentropy": 1.94467431306839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210828959941864, "step": 9500 }, { "epoch": 0.19004, "grad_norm": 2.015625, "grad_norm_var": 0.023298136393229165, "learning_rate": 0.0001, "loss": 4.2383, "loss/crossentropy": 1.9377062320709229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22141354531049728, "step": 9502 }, { "epoch": 0.19008, "grad_norm": 2.0, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 4.3717, "loss/crossentropy": 2.2015358209609985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22363336384296417, "step": 9504 }, { "epoch": 0.19012, "grad_norm": 2.109375, "grad_norm_var": 0.017235310872395833, "learning_rate": 0.0001, "loss": 4.2332, "loss/crossentropy": 2.373024582862854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2402234748005867, "step": 9506 }, { "epoch": 0.19016, "grad_norm": 2.140625, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 4.0519, "loss/crossentropy": 2.1573110222816467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24207139760255814, "step": 9508 }, { "epoch": 0.1902, "grad_norm": 2.203125, "grad_norm_var": 0.014788564046223958, "learning_rate": 0.0001, "loss": 4.4041, "loss/crossentropy": 2.324455976486206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24618541449308395, "step": 9510 }, { "epoch": 0.19024, "grad_norm": 2.078125, "grad_norm_var": 0.010603586832682291, "learning_rate": 0.0001, "loss": 4.0065, "loss/crossentropy": 1.7480111718177795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049119919538498, "step": 9512 }, { "epoch": 0.19028, "grad_norm": 2.125, "grad_norm_var": 0.006300608317057292, "learning_rate": 0.0001, "loss": 4.3644, "loss/crossentropy": 1.9925439953804016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2314591035246849, "step": 9514 }, { "epoch": 0.19032, "grad_norm": 2.140625, "grad_norm_var": 0.007094065348307292, "learning_rate": 0.0001, "loss": 4.0702, "loss/crossentropy": 2.133601188659668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2329796403646469, "step": 9516 }, { "epoch": 0.19036, "grad_norm": 1.984375, "grad_norm_var": 0.010198720296223958, "learning_rate": 0.0001, "loss": 4.246, "loss/crossentropy": 2.093464970588684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22636859863996506, "step": 9518 }, { "epoch": 0.1904, "grad_norm": 2.0625, "grad_norm_var": 0.009757232666015626, "learning_rate": 0.0001, "loss": 4.4766, "loss/crossentropy": 2.6137614250183105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27037859708070755, "step": 9520 }, { "epoch": 0.19044, "grad_norm": 2.125, "grad_norm_var": 0.012300364176432292, "learning_rate": 0.0001, "loss": 4.5499, "loss/crossentropy": 2.008640229701996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21647297590970993, "step": 9522 }, { "epoch": 0.19048, "grad_norm": 2.078125, "grad_norm_var": 0.012286122639973958, "learning_rate": 0.0001, "loss": 4.2467, "loss/crossentropy": 2.12644362449646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23370585590600967, "step": 9524 }, { "epoch": 0.19052, "grad_norm": 2.109375, "grad_norm_var": 0.011425526936848958, "learning_rate": 0.0001, "loss": 4.5409, "loss/crossentropy": 2.2338638305664062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23637598007917404, "step": 9526 }, { "epoch": 0.19056, "grad_norm": 2.0, "grad_norm_var": 0.011785634358723958, "learning_rate": 0.0001, "loss": 4.1405, "loss/crossentropy": 2.0632832646369934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20682457089424133, "step": 9528 }, { "epoch": 0.1906, "grad_norm": 1.9921875, "grad_norm_var": 0.012726847330729167, "learning_rate": 0.0001, "loss": 4.4058, "loss/crossentropy": 2.219120740890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22551076859235764, "step": 9530 }, { "epoch": 0.19064, "grad_norm": 1.953125, "grad_norm_var": 0.011860911051432292, "learning_rate": 0.0001, "loss": 4.2985, "loss/crossentropy": 2.4633371829986572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24491076171398163, "step": 9532 }, { "epoch": 0.19068, "grad_norm": 2.03125, "grad_norm_var": 0.008211008707682292, "learning_rate": 0.0001, "loss": 4.2152, "loss/crossentropy": 2.2408339977264404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22265981882810593, "step": 9534 }, { "epoch": 0.19072, "grad_norm": 2.234375, "grad_norm_var": 0.009544881184895833, "learning_rate": 0.0001, "loss": 4.326, "loss/crossentropy": 1.9779353141784668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159332111477852, "step": 9536 }, { "epoch": 0.19076, "grad_norm": 2.265625, "grad_norm_var": 0.0082275390625, "learning_rate": 0.0001, "loss": 4.3522, "loss/crossentropy": 2.0315812826156616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20710154622793198, "step": 9538 }, { "epoch": 0.1908, "grad_norm": 2.078125, "grad_norm_var": 0.011787923177083333, "learning_rate": 0.0001, "loss": 4.18, "loss/crossentropy": 2.085337817668915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22174393385648727, "step": 9540 }, { "epoch": 0.19084, "grad_norm": 2.03125, "grad_norm_var": 0.016507975260416665, "learning_rate": 0.0001, "loss": 4.4386, "loss/crossentropy": 2.3449169397354126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3135230466723442, "step": 9542 }, { "epoch": 0.19088, "grad_norm": 2.046875, "grad_norm_var": 0.016169230143229168, "learning_rate": 0.0001, "loss": 4.3943, "loss/crossentropy": 2.1083431243896484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21395261585712433, "step": 9544 }, { "epoch": 0.19092, "grad_norm": 2.03125, "grad_norm_var": 0.015730539957682293, "learning_rate": 0.0001, "loss": 4.2091, "loss/crossentropy": 2.0386710166931152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22487390786409378, "step": 9546 }, { "epoch": 0.19096, "grad_norm": 2.21875, "grad_norm_var": 0.015240224202473958, "learning_rate": 0.0001, "loss": 4.2377, "loss/crossentropy": 1.9533087611198425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223353162407875, "step": 9548 }, { "epoch": 0.191, "grad_norm": 2.21875, "grad_norm_var": 0.015317535400390625, "learning_rate": 0.0001, "loss": 4.4183, "loss/crossentropy": 2.35861599445343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26162558794021606, "step": 9550 }, { "epoch": 0.19104, "grad_norm": 2.015625, "grad_norm_var": 0.014388020833333333, "learning_rate": 0.0001, "loss": 4.2507, "loss/crossentropy": 1.9529705047607422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125585675239563, "step": 9552 }, { "epoch": 0.19108, "grad_norm": 2.109375, "grad_norm_var": 0.011844889322916666, "learning_rate": 0.0001, "loss": 4.2041, "loss/crossentropy": 2.0364453196525574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21163037419319153, "step": 9554 }, { "epoch": 0.19112, "grad_norm": 2.078125, "grad_norm_var": 0.009749348958333333, "learning_rate": 0.0001, "loss": 4.2758, "loss/crossentropy": 2.1321409940719604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24250106513500214, "step": 9556 }, { "epoch": 0.19116, "grad_norm": 2.125, "grad_norm_var": 0.0073931376139322914, "learning_rate": 0.0001, "loss": 4.1848, "loss/crossentropy": 2.024011969566345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20132286846637726, "step": 9558 }, { "epoch": 0.1912, "grad_norm": 2.109375, "grad_norm_var": 0.006980133056640625, "learning_rate": 0.0001, "loss": 4.3726, "loss/crossentropy": 2.106776535511017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23114337772130966, "step": 9560 }, { "epoch": 0.19124, "grad_norm": 2.265625, "grad_norm_var": 0.007645416259765625, "learning_rate": 0.0001, "loss": 4.4642, "loss/crossentropy": 1.9228236079216003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22426098585128784, "step": 9562 }, { "epoch": 0.19128, "grad_norm": 2.171875, "grad_norm_var": 0.008090972900390625, "learning_rate": 0.0001, "loss": 4.2384, "loss/crossentropy": 2.3033370971679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20113499462604523, "step": 9564 }, { "epoch": 0.19132, "grad_norm": 2.125, "grad_norm_var": 0.008973948160807292, "learning_rate": 0.0001, "loss": 4.0736, "loss/crossentropy": 1.6983963251113892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19003060460090637, "step": 9566 }, { "epoch": 0.19136, "grad_norm": 2.34375, "grad_norm_var": 0.011295318603515625, "learning_rate": 0.0001, "loss": 4.5283, "loss/crossentropy": 2.502004861831665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24949797987937927, "step": 9568 }, { "epoch": 0.1914, "grad_norm": 2.03125, "grad_norm_var": 0.011793772379557291, "learning_rate": 0.0001, "loss": 4.2004, "loss/crossentropy": 1.9981504678726196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152494639158249, "step": 9570 }, { "epoch": 0.19144, "grad_norm": 1.96875, "grad_norm_var": 0.012931060791015626, "learning_rate": 0.0001, "loss": 4.1211, "loss/crossentropy": 2.1489784717559814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22105325013399124, "step": 9572 }, { "epoch": 0.19148, "grad_norm": 1.9453125, "grad_norm_var": 0.013016510009765624, "learning_rate": 0.0001, "loss": 4.2234, "loss/crossentropy": 1.974421203136444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23960395902395248, "step": 9574 }, { "epoch": 0.19152, "grad_norm": 2.09375, "grad_norm_var": 0.012670644124348958, "learning_rate": 0.0001, "loss": 4.4119, "loss/crossentropy": 2.379599928855896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26731471717357635, "step": 9576 }, { "epoch": 0.19156, "grad_norm": 2.015625, "grad_norm_var": 0.012444814046223959, "learning_rate": 0.0001, "loss": 4.3718, "loss/crossentropy": 2.26211154460907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22957566380500793, "step": 9578 }, { "epoch": 0.1916, "grad_norm": 2.125, "grad_norm_var": 0.011871083577473959, "learning_rate": 0.0001, "loss": 4.4594, "loss/crossentropy": 2.282869577407837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2404445931315422, "step": 9580 }, { "epoch": 0.19164, "grad_norm": 2.03125, "grad_norm_var": 0.010457102457682292, "learning_rate": 0.0001, "loss": 4.1592, "loss/crossentropy": 1.9183810949325562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18672315031290054, "step": 9582 }, { "epoch": 0.19168, "grad_norm": 2.046875, "grad_norm_var": 0.0076812744140625, "learning_rate": 0.0001, "loss": 4.1801, "loss/crossentropy": 2.2722173929214478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23262400180101395, "step": 9584 }, { "epoch": 0.19172, "grad_norm": 2.0625, "grad_norm_var": 0.0075266520182291664, "learning_rate": 0.0001, "loss": 4.4032, "loss/crossentropy": 2.426178455352783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23461100459098816, "step": 9586 }, { "epoch": 0.19176, "grad_norm": 2.09375, "grad_norm_var": 0.00687255859375, "learning_rate": 0.0001, "loss": 4.344, "loss/crossentropy": 2.2266165018081665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22530251741409302, "step": 9588 }, { "epoch": 0.1918, "grad_norm": 2.078125, "grad_norm_var": 0.005541737874348958, "learning_rate": 0.0001, "loss": 4.1633, "loss/crossentropy": 1.786275327205658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19257958233356476, "step": 9590 }, { "epoch": 0.19184, "grad_norm": 2.0, "grad_norm_var": 0.008868153889973958, "learning_rate": 0.0001, "loss": 4.2367, "loss/crossentropy": 1.8497431874275208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21965742111206055, "step": 9592 }, { "epoch": 0.19188, "grad_norm": 2.078125, "grad_norm_var": 0.008143870035807292, "learning_rate": 0.0001, "loss": 4.5612, "loss/crossentropy": 2.492846131324768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21851783990859985, "step": 9594 }, { "epoch": 0.19192, "grad_norm": 2.28125, "grad_norm_var": 0.010027821858723958, "learning_rate": 0.0001, "loss": 4.6838, "loss/crossentropy": 2.3447986841201782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26567359268665314, "step": 9596 }, { "epoch": 0.19196, "grad_norm": 1.984375, "grad_norm_var": 0.010654449462890625, "learning_rate": 0.0001, "loss": 4.1513, "loss/crossentropy": 1.656063199043274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903422325849533, "step": 9598 }, { "epoch": 0.192, "grad_norm": 2.125, "grad_norm_var": 0.008329264322916667, "learning_rate": 0.0001, "loss": 4.3252, "loss/crossentropy": 2.1816134452819824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23314762860536575, "step": 9600 }, { "epoch": 0.19204, "grad_norm": 2.015625, "grad_norm_var": 0.010007476806640625, "learning_rate": 0.0001, "loss": 3.8537, "loss/crossentropy": 1.787261426448822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20938758552074432, "step": 9602 }, { "epoch": 0.19208, "grad_norm": 2.015625, "grad_norm_var": 0.010526275634765625, "learning_rate": 0.0001, "loss": 4.2374, "loss/crossentropy": 1.9722678065299988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23085469752550125, "step": 9604 }, { "epoch": 0.19212, "grad_norm": 1.9453125, "grad_norm_var": 0.012113444010416667, "learning_rate": 0.0001, "loss": 4.0954, "loss/crossentropy": 2.04559987783432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21647220849990845, "step": 9606 }, { "epoch": 0.19216, "grad_norm": 2.140625, "grad_norm_var": 0.008934529622395833, "learning_rate": 0.0001, "loss": 4.5397, "loss/crossentropy": 2.5426105260849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23680832237005234, "step": 9608 }, { "epoch": 0.1922, "grad_norm": 2.0, "grad_norm_var": 0.008890787760416666, "learning_rate": 0.0001, "loss": 4.245, "loss/crossentropy": 2.1339075565338135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22501112520694733, "step": 9610 }, { "epoch": 0.19224, "grad_norm": 2.03125, "grad_norm_var": 0.006550852457682292, "learning_rate": 0.0001, "loss": 4.0924, "loss/crossentropy": 1.9554831981658936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20187357813119888, "step": 9612 }, { "epoch": 0.19228, "grad_norm": 2.203125, "grad_norm_var": 0.009065500895182292, "learning_rate": 0.0001, "loss": 4.5578, "loss/crossentropy": 2.2996249198913574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24954287707805634, "step": 9614 }, { "epoch": 0.19232, "grad_norm": 2.125, "grad_norm_var": 0.009065500895182292, "learning_rate": 0.0001, "loss": 4.3753, "loss/crossentropy": 2.2439414262771606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22833245247602463, "step": 9616 }, { "epoch": 0.19236, "grad_norm": 2.046875, "grad_norm_var": 0.007933553059895833, "learning_rate": 0.0001, "loss": 4.2297, "loss/crossentropy": 1.8900890946388245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21077623218297958, "step": 9618 }, { "epoch": 0.1924, "grad_norm": 2.03125, "grad_norm_var": 0.007804361979166666, "learning_rate": 0.0001, "loss": 4.287, "loss/crossentropy": 2.110726058483124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21897974610328674, "step": 9620 }, { "epoch": 0.19244, "grad_norm": 1.9765625, "grad_norm_var": 0.0080474853515625, "learning_rate": 0.0001, "loss": 4.0135, "loss/crossentropy": 1.7363090515136719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19939633458852768, "step": 9622 }, { "epoch": 0.19248, "grad_norm": 2.109375, "grad_norm_var": 0.01014404296875, "learning_rate": 0.0001, "loss": 4.3643, "loss/crossentropy": 1.8148014545440674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21637701243162155, "step": 9624 }, { "epoch": 0.19252, "grad_norm": 2.03125, "grad_norm_var": 0.0092041015625, "learning_rate": 0.0001, "loss": 4.2451, "loss/crossentropy": 2.2895134687423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2297188639640808, "step": 9626 }, { "epoch": 0.19256, "grad_norm": 2.15625, "grad_norm_var": 0.006296539306640625, "learning_rate": 0.0001, "loss": 4.4778, "loss/crossentropy": 2.117924213409424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24057473242282867, "step": 9628 }, { "epoch": 0.1926, "grad_norm": 2.140625, "grad_norm_var": 0.005421702067057292, "learning_rate": 0.0001, "loss": 4.2791, "loss/crossentropy": 1.8709319829940796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106107696890831, "step": 9630 }, { "epoch": 0.19264, "grad_norm": 1.9140625, "grad_norm_var": 0.0079742431640625, "learning_rate": 0.0001, "loss": 4.0174, "loss/crossentropy": 1.5156871676445007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17487400770187378, "step": 9632 }, { "epoch": 0.19268, "grad_norm": 2.1875, "grad_norm_var": 0.0183837890625, "learning_rate": 0.0001, "loss": 4.6281, "loss/crossentropy": 2.153126537799835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22009101510047913, "step": 9634 }, { "epoch": 0.19272, "grad_norm": 2.09375, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 4.443, "loss/crossentropy": 2.0890414714813232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233077436685562, "step": 9636 }, { "epoch": 0.19276, "grad_norm": 2.015625, "grad_norm_var": 0.01858495076497396, "learning_rate": 0.0001, "loss": 4.2572, "loss/crossentropy": 2.1218496561050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22576671838760376, "step": 9638 }, { "epoch": 0.1928, "grad_norm": 1.921875, "grad_norm_var": 0.020918528238932293, "learning_rate": 0.0001, "loss": 4.2522, "loss/crossentropy": 2.1131649017333984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21846124529838562, "step": 9640 }, { "epoch": 0.19284, "grad_norm": 2.1875, "grad_norm_var": 0.02067845662434896, "learning_rate": 0.0001, "loss": 4.6643, "loss/crossentropy": 2.3941714763641357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.246211439371109, "step": 9642 }, { "epoch": 0.19288, "grad_norm": 1.890625, "grad_norm_var": 0.02687352498372396, "learning_rate": 0.0001, "loss": 4.0641, "loss/crossentropy": 1.9579638838768005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198220357298851, "step": 9644 }, { "epoch": 0.19292, "grad_norm": 2.140625, "grad_norm_var": 0.02754491170247396, "learning_rate": 0.0001, "loss": 4.564, "loss/crossentropy": 2.388027787208557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23349716514348984, "step": 9646 }, { "epoch": 0.19296, "grad_norm": 2.15625, "grad_norm_var": 0.02520726521809896, "learning_rate": 0.0001, "loss": 4.3678, "loss/crossentropy": 1.8203087449073792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18923642486333847, "step": 9648 }, { "epoch": 0.193, "grad_norm": 2.03125, "grad_norm_var": 0.015476226806640625, "learning_rate": 0.0001, "loss": 4.2119, "loss/crossentropy": 1.996269702911377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096453383564949, "step": 9650 }, { "epoch": 0.19304, "grad_norm": 2.09375, "grad_norm_var": 0.01572240193684896, "learning_rate": 0.0001, "loss": 4.2921, "loss/crossentropy": 1.806606113910675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19824761897325516, "step": 9652 }, { "epoch": 0.19308, "grad_norm": 2.28125, "grad_norm_var": 0.01651178995768229, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 2.190830111503601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22319861501455307, "step": 9654 }, { "epoch": 0.19312, "grad_norm": 2.15625, "grad_norm_var": 0.014048004150390625, "learning_rate": 0.0001, "loss": 4.4579, "loss/crossentropy": 1.9721493124961853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2011292800307274, "step": 9656 }, { "epoch": 0.19316, "grad_norm": 2.234375, "grad_norm_var": 0.014277903238932292, "learning_rate": 0.0001, "loss": 4.5673, "loss/crossentropy": 2.1256929636001587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2728194147348404, "step": 9658 }, { "epoch": 0.1932, "grad_norm": 2.125, "grad_norm_var": 0.008345286051432291, "learning_rate": 0.0001, "loss": 4.3206, "loss/crossentropy": 2.090156316757202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240355908870697, "step": 9660 }, { "epoch": 0.19324, "grad_norm": 1.9140625, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 4.0231, "loss/crossentropy": 2.2930272817611694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23188824206590652, "step": 9662 }, { "epoch": 0.19328, "grad_norm": 1.9453125, "grad_norm_var": 0.0122222900390625, "learning_rate": 0.0001, "loss": 4.0075, "loss/crossentropy": 1.9754068851470947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18698863685131073, "step": 9664 }, { "epoch": 0.19332, "grad_norm": 2.09375, "grad_norm_var": 0.0118560791015625, "learning_rate": 0.0001, "loss": 4.1668, "loss/crossentropy": 1.8987788558006287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22531752288341522, "step": 9666 }, { "epoch": 0.19336, "grad_norm": 2.0625, "grad_norm_var": 0.011678059895833334, "learning_rate": 0.0001, "loss": 4.3185, "loss/crossentropy": 2.2699583768844604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2572309076786041, "step": 9668 }, { "epoch": 0.1934, "grad_norm": 2.0625, "grad_norm_var": 0.0072509765625, "learning_rate": 0.0001, "loss": 4.3267, "loss/crossentropy": 1.6649349927902222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23388104140758514, "step": 9670 }, { "epoch": 0.19344, "grad_norm": 2.171875, "grad_norm_var": 0.007258097330729167, "learning_rate": 0.0001, "loss": 4.3846, "loss/crossentropy": 2.173617362976074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20848755538463593, "step": 9672 }, { "epoch": 0.19348, "grad_norm": 2.03125, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 4.3252, "loss/crossentropy": 2.2690787315368652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23493453860282898, "step": 9674 }, { "epoch": 0.19352, "grad_norm": 1.9375, "grad_norm_var": 0.007079060872395833, "learning_rate": 0.0001, "loss": 4.3621, "loss/crossentropy": 2.00560861825943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22538188099861145, "step": 9676 }, { "epoch": 0.19356, "grad_norm": 2.0625, "grad_norm_var": 0.004369862874348958, "learning_rate": 0.0001, "loss": 4.1264, "loss/crossentropy": 1.960120975971222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21056914329528809, "step": 9678 }, { "epoch": 0.1936, "grad_norm": 1.9921875, "grad_norm_var": 0.004689280192057292, "learning_rate": 0.0001, "loss": 3.9868, "loss/crossentropy": 1.8921862840652466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22250327467918396, "step": 9680 }, { "epoch": 0.19364, "grad_norm": 2.0625, "grad_norm_var": 0.004839833577473958, "learning_rate": 0.0001, "loss": 4.296, "loss/crossentropy": 1.9474233984947205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19719959795475006, "step": 9682 }, { "epoch": 0.19368, "grad_norm": 2.21875, "grad_norm_var": 0.0072100321451822914, "learning_rate": 0.0001, "loss": 4.2985, "loss/crossentropy": 2.3391844034194946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22263485193252563, "step": 9684 }, { "epoch": 0.19372, "grad_norm": 2.21875, "grad_norm_var": 0.008715565999348958, "learning_rate": 0.0001, "loss": 4.3641, "loss/crossentropy": 2.190012037754059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22531607002019882, "step": 9686 }, { "epoch": 0.19376, "grad_norm": 2.15625, "grad_norm_var": 0.008283487955729167, "learning_rate": 0.0001, "loss": 4.2601, "loss/crossentropy": 1.9935640096664429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438511461019516, "step": 9688 }, { "epoch": 0.1938, "grad_norm": 2.15625, "grad_norm_var": 0.009186808268229167, "learning_rate": 0.0001, "loss": 4.1756, "loss/crossentropy": 1.7482191324234009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20972990244627, "step": 9690 }, { "epoch": 0.19384, "grad_norm": 2.125, "grad_norm_var": 0.008353678385416667, "learning_rate": 0.0001, "loss": 4.475, "loss/crossentropy": 2.2413275241851807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24103231728076935, "step": 9692 }, { "epoch": 0.19388, "grad_norm": 2.140625, "grad_norm_var": 0.008519490559895834, "learning_rate": 0.0001, "loss": 4.2886, "loss/crossentropy": 2.2846572399139404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2420315518975258, "step": 9694 }, { "epoch": 0.19392, "grad_norm": 2.265625, "grad_norm_var": 0.007968902587890625, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 1.7977504134178162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20145538449287415, "step": 9696 }, { "epoch": 0.19396, "grad_norm": 2.0625, "grad_norm_var": 0.007535552978515625, "learning_rate": 0.0001, "loss": 4.205, "loss/crossentropy": 2.0343902111053467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21595563739538193, "step": 9698 }, { "epoch": 0.194, "grad_norm": 2.125, "grad_norm_var": 0.0052487691243489586, "learning_rate": 0.0001, "loss": 3.9524, "loss/crossentropy": 1.8828233480453491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19624938070774078, "step": 9700 }, { "epoch": 0.19404, "grad_norm": 1.984375, "grad_norm_var": 0.005602773030598958, "learning_rate": 0.0001, "loss": 4.1569, "loss/crossentropy": 1.9177573323249817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19467243552207947, "step": 9702 }, { "epoch": 0.19408, "grad_norm": 1.9296875, "grad_norm_var": 0.006613922119140625, "learning_rate": 0.0001, "loss": 3.8881, "loss/crossentropy": 1.8025588393211365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18757501989603043, "step": 9704 }, { "epoch": 0.19412, "grad_norm": 2.140625, "grad_norm_var": 0.006723785400390625, "learning_rate": 0.0001, "loss": 4.3195, "loss/crossentropy": 2.025633454322815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21623078733682632, "step": 9706 }, { "epoch": 0.19416, "grad_norm": 2.109375, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 3.8244, "loss/crossentropy": 1.9421055316925049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20563311874866486, "step": 9708 }, { "epoch": 0.1942, "grad_norm": 2.09375, "grad_norm_var": 0.009191640218098958, "learning_rate": 0.0001, "loss": 4.0977, "loss/crossentropy": 1.9820671081542969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22029083967208862, "step": 9710 }, { "epoch": 0.19424, "grad_norm": 2.328125, "grad_norm_var": 0.010341135660807292, "learning_rate": 0.0001, "loss": 4.2037, "loss/crossentropy": 1.9491158723831177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19559209793806076, "step": 9712 }, { "epoch": 0.19428, "grad_norm": 2.0625, "grad_norm_var": 0.010416412353515625, "learning_rate": 0.0001, "loss": 4.1592, "loss/crossentropy": 1.8653306365013123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18629660457372665, "step": 9714 }, { "epoch": 0.19432, "grad_norm": 1.953125, "grad_norm_var": 0.01950251261393229, "learning_rate": 0.0001, "loss": 4.2567, "loss/crossentropy": 1.6897491812705994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1948539912700653, "step": 9716 }, { "epoch": 0.19436, "grad_norm": 2.203125, "grad_norm_var": 0.020776112874348957, "learning_rate": 0.0001, "loss": 4.4387, "loss/crossentropy": 2.07690966129303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147115021944046, "step": 9718 }, { "epoch": 0.1944, "grad_norm": 2.03125, "grad_norm_var": 0.020189412434895835, "learning_rate": 0.0001, "loss": 4.269, "loss/crossentropy": 2.0356597304344177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20626354217529297, "step": 9720 }, { "epoch": 0.19444, "grad_norm": 2.0625, "grad_norm_var": 0.0204254150390625, "learning_rate": 0.0001, "loss": 4.0548, "loss/crossentropy": 1.7709991931915283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18573015183210373, "step": 9722 }, { "epoch": 0.19448, "grad_norm": 1.984375, "grad_norm_var": 0.01830012003580729, "learning_rate": 0.0001, "loss": 4.117, "loss/crossentropy": 2.0255925059318542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22114800661802292, "step": 9724 }, { "epoch": 0.19452, "grad_norm": 2.25, "grad_norm_var": 0.01871337890625, "learning_rate": 0.0001, "loss": 4.4655, "loss/crossentropy": 2.0046772956848145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22649522870779037, "step": 9726 }, { "epoch": 0.19456, "grad_norm": 2.15625, "grad_norm_var": 0.015827433268229166, "learning_rate": 0.0001, "loss": 4.179, "loss/crossentropy": 2.2746634483337402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23017344623804092, "step": 9728 }, { "epoch": 0.1946, "grad_norm": 2.078125, "grad_norm_var": 0.015184529622395833, "learning_rate": 0.0001, "loss": 4.203, "loss/crossentropy": 2.0191025137901306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20670472085475922, "step": 9730 }, { "epoch": 0.19464, "grad_norm": 2.015625, "grad_norm_var": 0.008177693684895833, "learning_rate": 0.0001, "loss": 4.1657, "loss/crossentropy": 2.3198455572128296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24985665082931519, "step": 9732 }, { "epoch": 0.19468, "grad_norm": 2.09375, "grad_norm_var": 0.0073150634765625, "learning_rate": 0.0001, "loss": 4.3886, "loss/crossentropy": 2.5229711532592773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22920701652765274, "step": 9734 }, { "epoch": 0.19472, "grad_norm": 2.078125, "grad_norm_var": 0.0072662353515625, "learning_rate": 0.0001, "loss": 3.9991, "loss/crossentropy": 2.081319808959961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21542686223983765, "step": 9736 }, { "epoch": 0.19476, "grad_norm": 2.046875, "grad_norm_var": 0.008373006184895834, "learning_rate": 0.0001, "loss": 4.0698, "loss/crossentropy": 2.2170007824897766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21852095425128937, "step": 9738 }, { "epoch": 0.1948, "grad_norm": 1.9453125, "grad_norm_var": 0.008265940348307292, "learning_rate": 0.0001, "loss": 4.1528, "loss/crossentropy": 1.830683708190918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910252571105957, "step": 9740 }, { "epoch": 0.19484, "grad_norm": 2.25, "grad_norm_var": 0.008420562744140625, "learning_rate": 0.0001, "loss": 4.3543, "loss/crossentropy": 2.1303864121437073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.228670135140419, "step": 9742 }, { "epoch": 0.19488, "grad_norm": 2.015625, "grad_norm_var": 0.007486724853515625, "learning_rate": 0.0001, "loss": 4.1504, "loss/crossentropy": 2.2621915340423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22027206420898438, "step": 9744 }, { "epoch": 0.19492, "grad_norm": 2.0, "grad_norm_var": 0.0110260009765625, "learning_rate": 0.0001, "loss": 3.9564, "loss/crossentropy": 2.044555902481079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20469766855239868, "step": 9746 }, { "epoch": 0.19496, "grad_norm": 2.109375, "grad_norm_var": 0.0115142822265625, "learning_rate": 0.0001, "loss": 4.1677, "loss/crossentropy": 1.911176860332489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22036674618721008, "step": 9748 }, { "epoch": 0.195, "grad_norm": 1.96875, "grad_norm_var": 0.010545857747395833, "learning_rate": 0.0001, "loss": 4.0904, "loss/crossentropy": 1.8650219440460205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19878911972045898, "step": 9750 }, { "epoch": 0.19504, "grad_norm": 2.125, "grad_norm_var": 0.02088623046875, "learning_rate": 0.0001, "loss": 4.3648, "loss/crossentropy": 2.196391463279724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20845064520835876, "step": 9752 }, { "epoch": 0.19508, "grad_norm": 1.9921875, "grad_norm_var": 0.021201324462890626, "learning_rate": 0.0001, "loss": 4.0132, "loss/crossentropy": 2.222484588623047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144618257880211, "step": 9754 }, { "epoch": 0.19512, "grad_norm": 2.15625, "grad_norm_var": 0.020685831705729168, "learning_rate": 0.0001, "loss": 4.4078, "loss/crossentropy": 2.416514754295349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370949387550354, "step": 9756 }, { "epoch": 0.19516, "grad_norm": 1.9765625, "grad_norm_var": 0.020566558837890624, "learning_rate": 0.0001, "loss": 4.0119, "loss/crossentropy": 2.22190260887146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23128122836351395, "step": 9758 }, { "epoch": 0.1952, "grad_norm": 2.15625, "grad_norm_var": 0.024008941650390626, "learning_rate": 0.0001, "loss": 4.4253, "loss/crossentropy": 2.446492910385132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23962965607643127, "step": 9760 }, { "epoch": 0.19524, "grad_norm": 2.015625, "grad_norm_var": 0.019245402018229166, "learning_rate": 0.0001, "loss": 4.1676, "loss/crossentropy": 2.1458136439323425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19657295942306519, "step": 9762 }, { "epoch": 0.19528, "grad_norm": 2.203125, "grad_norm_var": 0.0197418212890625, "learning_rate": 0.0001, "loss": 4.6029, "loss/crossentropy": 1.9773340225219727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22887050360441208, "step": 9764 }, { "epoch": 0.19532, "grad_norm": 1.9453125, "grad_norm_var": 0.020334625244140626, "learning_rate": 0.0001, "loss": 4.1583, "loss/crossentropy": 2.0938061475753784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19823112338781357, "step": 9766 }, { "epoch": 0.19536, "grad_norm": 2.3125, "grad_norm_var": 0.015636952718098958, "learning_rate": 0.0001, "loss": 4.2628, "loss/crossentropy": 1.9068174958229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19924984872341156, "step": 9768 }, { "epoch": 0.1954, "grad_norm": 2.109375, "grad_norm_var": 0.012422688802083333, "learning_rate": 0.0001, "loss": 4.3567, "loss/crossentropy": 2.045413613319397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19175175577402115, "step": 9770 }, { "epoch": 0.19544, "grad_norm": 2.09375, "grad_norm_var": 0.013106282552083333, "learning_rate": 0.0001, "loss": 4.1972, "loss/crossentropy": 2.0262961983680725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22137057781219482, "step": 9772 }, { "epoch": 0.19548, "grad_norm": 2.0625, "grad_norm_var": 0.012992350260416667, "learning_rate": 0.0001, "loss": 4.0624, "loss/crossentropy": 2.146475672721863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2284562587738037, "step": 9774 }, { "epoch": 0.19552, "grad_norm": 2.015625, "grad_norm_var": 0.011885579427083333, "learning_rate": 0.0001, "loss": 4.154, "loss/crossentropy": 2.0316100120544434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26275022327899933, "step": 9776 }, { "epoch": 0.19556, "grad_norm": 2.140625, "grad_norm_var": 0.011359659830729167, "learning_rate": 0.0001, "loss": 4.5424, "loss/crossentropy": 2.1386696100234985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22695952653884888, "step": 9778 }, { "epoch": 0.1956, "grad_norm": 2.578125, "grad_norm_var": 0.024405924479166667, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 2.3000820875167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24045731872320175, "step": 9780 }, { "epoch": 0.19564, "grad_norm": 2.3125, "grad_norm_var": 0.02415949503580729, "learning_rate": 0.0001, "loss": 4.7778, "loss/crossentropy": 2.1272310614585876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2318389192223549, "step": 9782 }, { "epoch": 0.19568, "grad_norm": 1.9453125, "grad_norm_var": 0.025655110677083332, "learning_rate": 0.0001, "loss": 4.1232, "loss/crossentropy": 1.8953965306282043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.213637076318264, "step": 9784 }, { "epoch": 0.19572, "grad_norm": 2.0625, "grad_norm_var": 0.0261871337890625, "learning_rate": 0.0001, "loss": 4.3459, "loss/crossentropy": 2.0738734006881714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100028172135353, "step": 9786 }, { "epoch": 0.19576, "grad_norm": 2.0, "grad_norm_var": 0.027378082275390625, "learning_rate": 0.0001, "loss": 4.2599, "loss/crossentropy": 1.9454593658447266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18557409197092056, "step": 9788 }, { "epoch": 0.1958, "grad_norm": 2.015625, "grad_norm_var": 0.026569620768229166, "learning_rate": 0.0001, "loss": 4.1655, "loss/crossentropy": 2.101949095726013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22479471564292908, "step": 9790 }, { "epoch": 0.19584, "grad_norm": 2.25, "grad_norm_var": 0.025935872395833334, "learning_rate": 0.0001, "loss": 4.3841, "loss/crossentropy": 1.9524416327476501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20574256777763367, "step": 9792 }, { "epoch": 0.19588, "grad_norm": 2.296875, "grad_norm_var": 0.06084391276041667, "learning_rate": 0.0001, "loss": 4.1596, "loss/crossentropy": 1.9490719437599182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19411193579435349, "step": 9794 }, { "epoch": 0.19592, "grad_norm": 2.046875, "grad_norm_var": 0.050675455729166666, "learning_rate": 0.0001, "loss": 4.3438, "loss/crossentropy": 2.095793664455414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21078093349933624, "step": 9796 }, { "epoch": 0.19596, "grad_norm": 2.015625, "grad_norm_var": 0.0508056640625, "learning_rate": 0.0001, "loss": 4.0828, "loss/crossentropy": 2.032066822052002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181444078683853, "step": 9798 }, { "epoch": 0.196, "grad_norm": 2.140625, "grad_norm_var": 0.048378245035807295, "learning_rate": 0.0001, "loss": 4.0019, "loss/crossentropy": 2.1378380060195923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224056214094162, "step": 9800 }, { "epoch": 0.19604, "grad_norm": 2.1875, "grad_norm_var": 0.04784520467122396, "learning_rate": 0.0001, "loss": 4.2932, "loss/crossentropy": 2.120614767074585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104548141360283, "step": 9802 }, { "epoch": 0.19608, "grad_norm": 1.8984375, "grad_norm_var": 0.048954010009765625, "learning_rate": 0.0001, "loss": 4.1975, "loss/crossentropy": 1.8837141394615173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19755827635526657, "step": 9804 }, { "epoch": 0.19612, "grad_norm": 2.03125, "grad_norm_var": 0.04793675740559896, "learning_rate": 0.0001, "loss": 4.1645, "loss/crossentropy": 1.9325945973396301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005266472697258, "step": 9806 }, { "epoch": 0.19616, "grad_norm": 2.046875, "grad_norm_var": 0.04839045206705729, "learning_rate": 0.0001, "loss": 4.2211, "loss/crossentropy": 1.789370834827423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20060084015130997, "step": 9808 }, { "epoch": 0.1962, "grad_norm": 1.96875, "grad_norm_var": 0.01709162394205729, "learning_rate": 0.0001, "loss": 4.2463, "loss/crossentropy": 1.9807876348495483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21805762499570847, "step": 9810 }, { "epoch": 0.19624, "grad_norm": 1.9375, "grad_norm_var": 0.018888092041015624, "learning_rate": 0.0001, "loss": 4.1339, "loss/crossentropy": 2.210070848464966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23082506656646729, "step": 9812 }, { "epoch": 0.19628, "grad_norm": 2.0625, "grad_norm_var": 0.01693903605143229, "learning_rate": 0.0001, "loss": 4.1633, "loss/crossentropy": 1.8644117712974548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20300551503896713, "step": 9814 }, { "epoch": 0.19632, "grad_norm": 2.125, "grad_norm_var": 0.01634496053059896, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 1.6804233193397522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20438820868730545, "step": 9816 }, { "epoch": 0.19636, "grad_norm": 2.09375, "grad_norm_var": 0.01757990519205729, "learning_rate": 0.0001, "loss": 4.4339, "loss/crossentropy": 2.2513452768325806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24517202377319336, "step": 9818 }, { "epoch": 0.1964, "grad_norm": 2.203125, "grad_norm_var": 0.01683527628580729, "learning_rate": 0.0001, "loss": 4.2238, "loss/crossentropy": 2.3133161067962646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22146832942962646, "step": 9820 }, { "epoch": 0.19644, "grad_norm": 2.1875, "grad_norm_var": 0.016949208577473958, "learning_rate": 0.0001, "loss": 4.5003, "loss/crossentropy": 2.3226611614227295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327018678188324, "step": 9822 }, { "epoch": 0.19648, "grad_norm": 2.078125, "grad_norm_var": 0.016228993733723957, "learning_rate": 0.0001, "loss": 4.1832, "loss/crossentropy": 2.234626054763794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23657850921154022, "step": 9824 }, { "epoch": 0.19652, "grad_norm": 2.25, "grad_norm_var": 0.008941396077473959, "learning_rate": 0.0001, "loss": 4.4576, "loss/crossentropy": 2.3478844165802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24796272069215775, "step": 9826 }, { "epoch": 0.19656, "grad_norm": 2.171875, "grad_norm_var": 0.006982167561848958, "learning_rate": 0.0001, "loss": 4.3678, "loss/crossentropy": 2.094748795032501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21118396520614624, "step": 9828 }, { "epoch": 0.1966, "grad_norm": 2.140625, "grad_norm_var": 0.007289377848307291, "learning_rate": 0.0001, "loss": 4.2916, "loss/crossentropy": 2.166300058364868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773284673690796, "step": 9830 }, { "epoch": 0.19664, "grad_norm": 2.09375, "grad_norm_var": 0.007212066650390625, "learning_rate": 0.0001, "loss": 4.5253, "loss/crossentropy": 2.198317289352417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22636590898036957, "step": 9832 }, { "epoch": 0.19668, "grad_norm": 2.1875, "grad_norm_var": 0.006278228759765625, "learning_rate": 0.0001, "loss": 4.3405, "loss/crossentropy": 2.3081597089767456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231675922870636, "step": 9834 }, { "epoch": 0.19672, "grad_norm": 1.9921875, "grad_norm_var": 0.005206044514973958, "learning_rate": 0.0001, "loss": 4.1181, "loss/crossentropy": 2.048017203807831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20089948922395706, "step": 9836 }, { "epoch": 0.19676, "grad_norm": 2.125, "grad_norm_var": 0.01768773396809896, "learning_rate": 0.0001, "loss": 4.0515, "loss/crossentropy": 1.886117160320282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21702590584754944, "step": 9838 }, { "epoch": 0.1968, "grad_norm": 2.0625, "grad_norm_var": 0.017895253499348958, "learning_rate": 0.0001, "loss": 3.9827, "loss/crossentropy": 1.902605414390564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19976041465997696, "step": 9840 }, { "epoch": 0.19684, "grad_norm": 4.28125, "grad_norm_var": 0.309179433186849, "learning_rate": 0.0001, "loss": 4.0874, "loss/crossentropy": 1.7959995865821838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023170217871666, "step": 9842 }, { "epoch": 0.19688, "grad_norm": 2.15625, "grad_norm_var": 0.3066993713378906, "learning_rate": 0.0001, "loss": 4.5535, "loss/crossentropy": 2.1481738090515137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22133169323205948, "step": 9844 }, { "epoch": 0.19692, "grad_norm": 2.03125, "grad_norm_var": 0.30752741495768227, "learning_rate": 0.0001, "loss": 4.0269, "loss/crossentropy": 1.9328197240829468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20346572250127792, "step": 9846 }, { "epoch": 0.19696, "grad_norm": 2.015625, "grad_norm_var": 0.31202367146809895, "learning_rate": 0.0001, "loss": 4.0299, "loss/crossentropy": 2.096457004547119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21341010928153992, "step": 9848 }, { "epoch": 0.197, "grad_norm": 2.03125, "grad_norm_var": 0.3146522521972656, "learning_rate": 0.0001, "loss": 4.3105, "loss/crossentropy": 1.9698969721794128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1855439990758896, "step": 9850 }, { "epoch": 0.19704, "grad_norm": 2.234375, "grad_norm_var": 0.31038004557291665, "learning_rate": 0.0001, "loss": 4.2165, "loss/crossentropy": 1.987346351146698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23657751083374023, "step": 9852 }, { "epoch": 0.19708, "grad_norm": 1.8984375, "grad_norm_var": 0.3109169006347656, "learning_rate": 0.0001, "loss": 4.0346, "loss/crossentropy": 1.9535572528839111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21131790429353714, "step": 9854 }, { "epoch": 0.19712, "grad_norm": 2.09375, "grad_norm_var": 0.3090349833170573, "learning_rate": 0.0001, "loss": 4.5828, "loss/crossentropy": 2.08541601896286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2290833741426468, "step": 9856 }, { "epoch": 0.19716, "grad_norm": 2.28125, "grad_norm_var": 0.016155751546223958, "learning_rate": 0.0001, "loss": 4.2774, "loss/crossentropy": 2.1930192708969116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23072391748428345, "step": 9858 }, { "epoch": 0.1972, "grad_norm": 2.359375, "grad_norm_var": 0.016658274332682292, "learning_rate": 0.0001, "loss": 4.3923, "loss/crossentropy": 1.8369358777999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20687467604875565, "step": 9860 }, { "epoch": 0.19724, "grad_norm": 2.0, "grad_norm_var": 0.017380523681640624, "learning_rate": 0.0001, "loss": 4.0287, "loss/crossentropy": 1.6707186102867126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1808091625571251, "step": 9862 }, { "epoch": 0.19728, "grad_norm": 1.984375, "grad_norm_var": 0.017651112874348958, "learning_rate": 0.0001, "loss": 4.1184, "loss/crossentropy": 1.8352991342544556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19169726222753525, "step": 9864 }, { "epoch": 0.19732, "grad_norm": 2.734375, "grad_norm_var": 0.042909495035807294, "learning_rate": 0.0001, "loss": 4.2857, "loss/crossentropy": 1.9427489638328552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20076656341552734, "step": 9866 }, { "epoch": 0.19736, "grad_norm": 2.078125, "grad_norm_var": 0.04237035115559896, "learning_rate": 0.0001, "loss": 4.3725, "loss/crossentropy": 2.2190250158309937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24385111033916473, "step": 9868 }, { "epoch": 0.1974, "grad_norm": 1.9453125, "grad_norm_var": 0.04182510375976563, "learning_rate": 0.0001, "loss": 4.2192, "loss/crossentropy": 2.0958147644996643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23264098167419434, "step": 9870 }, { "epoch": 0.19744, "grad_norm": 2.25, "grad_norm_var": 0.04228897094726562, "learning_rate": 0.0001, "loss": 4.3571, "loss/crossentropy": 2.542472720146179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23458892852067947, "step": 9872 }, { "epoch": 0.19748, "grad_norm": 2.21875, "grad_norm_var": 0.03728408813476562, "learning_rate": 0.0001, "loss": 4.2767, "loss/crossentropy": 2.1562893390655518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2243279591202736, "step": 9874 }, { "epoch": 0.19752, "grad_norm": 2.21875, "grad_norm_var": 0.034395090738932294, "learning_rate": 0.0001, "loss": 4.0114, "loss/crossentropy": 1.9851951599121094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2369420975446701, "step": 9876 }, { "epoch": 0.19756, "grad_norm": 2.03125, "grad_norm_var": 0.032714589436848955, "learning_rate": 0.0001, "loss": 4.3017, "loss/crossentropy": 1.8751549124717712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20777788758277893, "step": 9878 }, { "epoch": 0.1976, "grad_norm": 2.125, "grad_norm_var": 0.0318267822265625, "learning_rate": 0.0001, "loss": 4.1328, "loss/crossentropy": 2.010055720806122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21204733848571777, "step": 9880 }, { "epoch": 0.19764, "grad_norm": 2.0625, "grad_norm_var": 0.008942667643229167, "learning_rate": 0.0001, "loss": 4.1122, "loss/crossentropy": 2.04589307308197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21330490708351135, "step": 9882 }, { "epoch": 0.19768, "grad_norm": 2.125, "grad_norm_var": 0.010138956705729167, "learning_rate": 0.0001, "loss": 4.1663, "loss/crossentropy": 1.9441133737564087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22589778900146484, "step": 9884 }, { "epoch": 0.19772, "grad_norm": 2.25, "grad_norm_var": 0.008296457926432292, "learning_rate": 0.0001, "loss": 4.4239, "loss/crossentropy": 2.305395483970642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24267761409282684, "step": 9886 }, { "epoch": 0.19776, "grad_norm": 2.03125, "grad_norm_var": 0.007269032796223958, "learning_rate": 0.0001, "loss": 4.386, "loss/crossentropy": 2.096014082431793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21358592063188553, "step": 9888 }, { "epoch": 0.1978, "grad_norm": 2.25, "grad_norm_var": 0.008149973551432292, "learning_rate": 0.0001, "loss": 4.2315, "loss/crossentropy": 2.1115033626556396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22994756698608398, "step": 9890 }, { "epoch": 0.19784, "grad_norm": 1.9375, "grad_norm_var": 0.008654530843098958, "learning_rate": 0.0001, "loss": 4.1419, "loss/crossentropy": 1.6122692227363586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1897209882736206, "step": 9892 }, { "epoch": 0.19788, "grad_norm": 2.0625, "grad_norm_var": 0.012933095296223959, "learning_rate": 0.0001, "loss": 4.1679, "loss/crossentropy": 1.4790136218070984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17421242594718933, "step": 9894 }, { "epoch": 0.19792, "grad_norm": 2.09375, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 4.2921, "loss/crossentropy": 1.8394885063171387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20840360969305038, "step": 9896 }, { "epoch": 0.19796, "grad_norm": 2.03125, "grad_norm_var": 0.0135894775390625, "learning_rate": 0.0001, "loss": 4.3993, "loss/crossentropy": 2.152444541454315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23834071308374405, "step": 9898 }, { "epoch": 0.198, "grad_norm": 2.1875, "grad_norm_var": 0.012189737955729167, "learning_rate": 0.0001, "loss": 4.5268, "loss/crossentropy": 2.367082357406616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23409543931484222, "step": 9900 }, { "epoch": 0.19804, "grad_norm": 2.125, "grad_norm_var": 0.011799112955729166, "learning_rate": 0.0001, "loss": 4.4599, "loss/crossentropy": 2.136048436164856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21797242760658264, "step": 9902 }, { "epoch": 0.19808, "grad_norm": 2.03125, "grad_norm_var": 0.011945597330729167, "learning_rate": 0.0001, "loss": 4.2262, "loss/crossentropy": 2.035883128643036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215388223528862, "step": 9904 }, { "epoch": 0.19812, "grad_norm": 2.203125, "grad_norm_var": 0.011970774332682291, "learning_rate": 0.0001, "loss": 4.2024, "loss/crossentropy": 2.0339369773864746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085876688361168, "step": 9906 }, { "epoch": 0.19816, "grad_norm": 2.0625, "grad_norm_var": 0.009655507405598958, "learning_rate": 0.0001, "loss": 4.4441, "loss/crossentropy": 1.829429566860199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20085029304027557, "step": 9908 }, { "epoch": 0.1982, "grad_norm": 1.984375, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.1108, "loss/crossentropy": 2.1721774339675903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118668630719185, "step": 9910 }, { "epoch": 0.19824, "grad_norm": 2.265625, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.3511, "loss/crossentropy": 1.958261251449585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22304313629865646, "step": 9912 }, { "epoch": 0.19828, "grad_norm": 2.21875, "grad_norm_var": 0.009870402018229167, "learning_rate": 0.0001, "loss": 4.5819, "loss/crossentropy": 2.4651763439178467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2435118407011032, "step": 9914 }, { "epoch": 0.19832, "grad_norm": 1.9140625, "grad_norm_var": 0.012532297770182292, "learning_rate": 0.0001, "loss": 4.0973, "loss/crossentropy": 2.173910617828369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21142201125621796, "step": 9916 }, { "epoch": 0.19836, "grad_norm": 2.078125, "grad_norm_var": 0.013239542643229166, "learning_rate": 0.0001, "loss": 3.9283, "loss/crossentropy": 1.9559763073921204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19051063805818558, "step": 9918 }, { "epoch": 0.1984, "grad_norm": 2.265625, "grad_norm_var": 0.015895334879557292, "learning_rate": 0.0001, "loss": 4.2516, "loss/crossentropy": 2.238003969192505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2338731735944748, "step": 9920 }, { "epoch": 0.19844, "grad_norm": 2.140625, "grad_norm_var": 0.015233357747395834, "learning_rate": 0.0001, "loss": 4.4624, "loss/crossentropy": 2.269726276397705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19430368393659592, "step": 9922 }, { "epoch": 0.19848, "grad_norm": 2.109375, "grad_norm_var": 0.015697224934895834, "learning_rate": 0.0001, "loss": 4.436, "loss/crossentropy": 2.351833701133728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24469508230686188, "step": 9924 }, { "epoch": 0.19852, "grad_norm": 2.15625, "grad_norm_var": 0.013398996988932292, "learning_rate": 0.0001, "loss": 4.3833, "loss/crossentropy": 2.0600146055221558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21375566720962524, "step": 9926 }, { "epoch": 0.19856, "grad_norm": 2.015625, "grad_norm_var": 0.012562815348307292, "learning_rate": 0.0001, "loss": 4.2957, "loss/crossentropy": 2.1543694734573364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20916736871004105, "step": 9928 }, { "epoch": 0.1986, "grad_norm": 2.015625, "grad_norm_var": 0.010782623291015625, "learning_rate": 0.0001, "loss": 4.3111, "loss/crossentropy": 1.8952317833900452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216978445649147, "step": 9930 }, { "epoch": 0.19864, "grad_norm": 2.03125, "grad_norm_var": 0.008695475260416667, "learning_rate": 0.0001, "loss": 4.0542, "loss/crossentropy": 2.102243661880493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22314947098493576, "step": 9932 }, { "epoch": 0.19868, "grad_norm": 1.9921875, "grad_norm_var": 0.0072021484375, "learning_rate": 0.0001, "loss": 4.1653, "loss/crossentropy": 1.9036884307861328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19729873538017273, "step": 9934 }, { "epoch": 0.19872, "grad_norm": 2.0, "grad_norm_var": 0.006459299723307292, "learning_rate": 0.0001, "loss": 4.17, "loss/crossentropy": 1.823796033859253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18089265376329422, "step": 9936 }, { "epoch": 0.19876, "grad_norm": 2.0, "grad_norm_var": 0.006302642822265625, "learning_rate": 0.0001, "loss": 3.8742, "loss/crossentropy": 2.055707633495331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20620816200971603, "step": 9938 }, { "epoch": 0.1988, "grad_norm": 2.109375, "grad_norm_var": 0.0055010477701822914, "learning_rate": 0.0001, "loss": 4.256, "loss/crossentropy": 2.0490049719810486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21709006279706955, "step": 9940 }, { "epoch": 0.19884, "grad_norm": 2.0625, "grad_norm_var": 0.005891672770182292, "learning_rate": 0.0001, "loss": 4.2733, "loss/crossentropy": 2.164198637008667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335330694913864, "step": 9942 }, { "epoch": 0.19888, "grad_norm": 1.9609375, "grad_norm_var": 0.006941731770833333, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 1.9218478202819824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945241093635559, "step": 9944 }, { "epoch": 0.19892, "grad_norm": 2.109375, "grad_norm_var": 0.007328033447265625, "learning_rate": 0.0001, "loss": 3.8591, "loss/crossentropy": 1.9456552267074585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20599794387817383, "step": 9946 }, { "epoch": 0.19896, "grad_norm": 1.96875, "grad_norm_var": 0.007749176025390625, "learning_rate": 0.0001, "loss": 4.0385, "loss/crossentropy": 2.0472273230552673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20920252054929733, "step": 9948 }, { "epoch": 0.199, "grad_norm": 1.96875, "grad_norm_var": 0.007860310872395833, "learning_rate": 0.0001, "loss": 4.308, "loss/crossentropy": 2.2252047061920166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2360471710562706, "step": 9950 }, { "epoch": 0.19904, "grad_norm": 2.09375, "grad_norm_var": 0.005537923177083333, "learning_rate": 0.0001, "loss": 4.0828, "loss/crossentropy": 1.646431565284729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1692601889371872, "step": 9952 }, { "epoch": 0.19908, "grad_norm": 2.046875, "grad_norm_var": 0.0054443359375, "learning_rate": 0.0001, "loss": 3.8818, "loss/crossentropy": 2.0114784836769104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20070793479681015, "step": 9954 }, { "epoch": 0.19912, "grad_norm": 1.953125, "grad_norm_var": 0.00677490234375, "learning_rate": 0.0001, "loss": 4.324, "loss/crossentropy": 2.001778781414032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21009314060211182, "step": 9956 }, { "epoch": 0.19916, "grad_norm": 2.0, "grad_norm_var": 0.006037394205729167, "learning_rate": 0.0001, "loss": 4.3493, "loss/crossentropy": 2.1330565214157104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22442802786827087, "step": 9958 }, { "epoch": 0.1992, "grad_norm": 1.9921875, "grad_norm_var": 0.005635579427083333, "learning_rate": 0.0001, "loss": 4.193, "loss/crossentropy": 1.9146793484687805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21145610511302948, "step": 9960 }, { "epoch": 0.19924, "grad_norm": 2.015625, "grad_norm_var": 0.004705556233723958, "learning_rate": 0.0001, "loss": 4.3262, "loss/crossentropy": 2.5224483013153076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23354032635688782, "step": 9962 }, { "epoch": 0.19928, "grad_norm": 2.109375, "grad_norm_var": 0.004552968343098958, "learning_rate": 0.0001, "loss": 4.3663, "loss/crossentropy": 2.245160937309265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21848157793283463, "step": 9964 }, { "epoch": 0.19932, "grad_norm": 2.15625, "grad_norm_var": 0.004622141520182292, "learning_rate": 0.0001, "loss": 4.3506, "loss/crossentropy": 2.122299015522003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21853189170360565, "step": 9966 }, { "epoch": 0.19936, "grad_norm": 2.171875, "grad_norm_var": 0.005716705322265625, "learning_rate": 0.0001, "loss": 4.4524, "loss/crossentropy": 2.31084668636322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23691194504499435, "step": 9968 }, { "epoch": 0.1994, "grad_norm": 2.25, "grad_norm_var": 0.007834625244140626, "learning_rate": 0.0001, "loss": 4.274, "loss/crossentropy": 2.2242307662963867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21957046538591385, "step": 9970 }, { "epoch": 0.19944, "grad_norm": 2.078125, "grad_norm_var": 0.007063547770182292, "learning_rate": 0.0001, "loss": 4.2264, "loss/crossentropy": 1.792852759361267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19996580481529236, "step": 9972 }, { "epoch": 0.19948, "grad_norm": 2.09375, "grad_norm_var": 0.0063250223795572914, "learning_rate": 0.0001, "loss": 4.3029, "loss/crossentropy": 1.9593411087989807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21858739852905273, "step": 9974 }, { "epoch": 0.19952, "grad_norm": 1.9609375, "grad_norm_var": 0.0067827860514322914, "learning_rate": 0.0001, "loss": 4.3067, "loss/crossentropy": 2.259618401527405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22094043344259262, "step": 9976 }, { "epoch": 0.19956, "grad_norm": 2.140625, "grad_norm_var": 0.0063168843587239586, "learning_rate": 0.0001, "loss": 4.373, "loss/crossentropy": 2.31876802444458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22025877982378006, "step": 9978 }, { "epoch": 0.1996, "grad_norm": 2.0625, "grad_norm_var": 0.006109364827473958, "learning_rate": 0.0001, "loss": 4.15, "loss/crossentropy": 1.7625555396080017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21313250809907913, "step": 9980 }, { "epoch": 0.19964, "grad_norm": 2.265625, "grad_norm_var": 0.007342274983723958, "learning_rate": 0.0001, "loss": 4.4628, "loss/crossentropy": 2.19295072555542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23694515973329544, "step": 9982 }, { "epoch": 0.19968, "grad_norm": 2.0625, "grad_norm_var": 0.008739217122395834, "learning_rate": 0.0001, "loss": 4.187, "loss/crossentropy": 2.4073877334594727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22179137915372849, "step": 9984 }, { "epoch": 0.19972, "grad_norm": 2.015625, "grad_norm_var": 0.008125813802083333, "learning_rate": 0.0001, "loss": 3.9804, "loss/crossentropy": 1.8942558765411377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20588286221027374, "step": 9986 }, { "epoch": 0.19976, "grad_norm": 1.9375, "grad_norm_var": 0.012188466389973958, "learning_rate": 0.0001, "loss": 4.109, "loss/crossentropy": 2.220746397972107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144280970096588, "step": 9988 }, { "epoch": 0.1998, "grad_norm": 2.125, "grad_norm_var": 0.01260986328125, "learning_rate": 0.0001, "loss": 4.2279, "loss/crossentropy": 1.9511706233024597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1998429372906685, "step": 9990 }, { "epoch": 0.19984, "grad_norm": 2.125, "grad_norm_var": 0.012247467041015625, "learning_rate": 0.0001, "loss": 4.143, "loss/crossentropy": 2.249726891517639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24011528491973877, "step": 9992 }, { "epoch": 0.19988, "grad_norm": 2.6875, "grad_norm_var": 0.03792292277018229, "learning_rate": 0.0001, "loss": 4.3104, "loss/crossentropy": 1.957375943660736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24464774131774902, "step": 9994 }, { "epoch": 0.19992, "grad_norm": 2.21875, "grad_norm_var": 0.039033762613932294, "learning_rate": 0.0001, "loss": 4.5492, "loss/crossentropy": 2.265984058380127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24291887879371643, "step": 9996 }, { "epoch": 0.19996, "grad_norm": 2.046875, "grad_norm_var": 0.03920873006184896, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 1.8064668774604797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21571090072393417, "step": 9998 }, { "epoch": 0.2, "grad_norm": 2.234375, "grad_norm_var": 0.03875732421875, "learning_rate": 0.0001, "loss": 4.2295, "loss/crossentropy": 1.9072380661964417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19980185478925705, "step": 10000 }, { "epoch": 0.20004, "grad_norm": 2.15625, "grad_norm_var": 0.036641438802083336, "learning_rate": 0.0001, "loss": 4.3919, "loss/crossentropy": 2.074933707714081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22161328792572021, "step": 10002 }, { "epoch": 0.20008, "grad_norm": 2.015625, "grad_norm_var": 0.027581532796223957, "learning_rate": 0.0001, "loss": 4.0447, "loss/crossentropy": 1.9344687461853027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1825423538684845, "step": 10004 }, { "epoch": 0.20012, "grad_norm": 3.640625, "grad_norm_var": 0.162158203125, "learning_rate": 0.0001, "loss": 4.1116, "loss/crossentropy": 1.866003930568695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22012518346309662, "step": 10006 }, { "epoch": 0.20016, "grad_norm": 2.0, "grad_norm_var": 0.162158203125, "learning_rate": 0.0001, "loss": 4.3517, "loss/crossentropy": 2.145058751106262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22737383097410202, "step": 10008 }, { "epoch": 0.2002, "grad_norm": 2.09375, "grad_norm_var": 0.15038960774739582, "learning_rate": 0.0001, "loss": 3.9755, "loss/crossentropy": 1.7828176617622375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22082456946372986, "step": 10010 }, { "epoch": 0.20024, "grad_norm": 2.109375, "grad_norm_var": 0.1537994384765625, "learning_rate": 0.0001, "loss": 4.3143, "loss/crossentropy": 2.1222537755966187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2176828756928444, "step": 10012 }, { "epoch": 0.20028, "grad_norm": 2.203125, "grad_norm_var": 0.15230712890625, "learning_rate": 0.0001, "loss": 4.53, "loss/crossentropy": 2.119267463684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24002012610435486, "step": 10014 }, { "epoch": 0.20032, "grad_norm": 2.078125, "grad_norm_var": 0.15458577473958332, "learning_rate": 0.0001, "loss": 4.3528, "loss/crossentropy": 2.198129415512085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23767977952957153, "step": 10016 }, { "epoch": 0.20036, "grad_norm": 2.125, "grad_norm_var": 0.15608317057291668, "learning_rate": 0.0001, "loss": 4.0228, "loss/crossentropy": 1.7466872334480286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19916100800037384, "step": 10018 }, { "epoch": 0.2004, "grad_norm": 1.9296875, "grad_norm_var": 0.15812352498372395, "learning_rate": 0.0001, "loss": 4.1989, "loss/crossentropy": 1.9834936261177063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891954466700554, "step": 10020 }, { "epoch": 0.20044, "grad_norm": 1.9921875, "grad_norm_var": 0.005840810139973959, "learning_rate": 0.0001, "loss": 4.0987, "loss/crossentropy": 1.8997412323951721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19624030590057373, "step": 10022 }, { "epoch": 0.20048, "grad_norm": 2.1875, "grad_norm_var": 0.0065915425618489586, "learning_rate": 0.0001, "loss": 4.2531, "loss/crossentropy": 2.0051563382148743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069488987326622, "step": 10024 }, { "epoch": 0.20052, "grad_norm": 2.265625, "grad_norm_var": 0.008107248942057292, "learning_rate": 0.0001, "loss": 4.4215, "loss/crossentropy": 1.8662462830543518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053019255399704, "step": 10026 }, { "epoch": 0.20056, "grad_norm": 2.171875, "grad_norm_var": 0.02585627237955729, "learning_rate": 0.0001, "loss": 4.1444, "loss/crossentropy": 2.1043936014175415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22167576104402542, "step": 10028 }, { "epoch": 0.2006, "grad_norm": 5.78125, "grad_norm_var": 0.859185536702474, "learning_rate": 0.0001, "loss": 4.3685, "loss/crossentropy": 2.1248152256011963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22400003671646118, "step": 10030 }, { "epoch": 0.20064, "grad_norm": 2.078125, "grad_norm_var": 0.852441151936849, "learning_rate": 0.0001, "loss": 4.1971, "loss/crossentropy": 2.326894521713257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24597454071044922, "step": 10032 }, { "epoch": 0.20068, "grad_norm": 2.203125, "grad_norm_var": 0.8413164774576823, "learning_rate": 0.0001, "loss": 4.1801, "loss/crossentropy": 1.8590435981750488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20270948112010956, "step": 10034 }, { "epoch": 0.20072, "grad_norm": 2.0, "grad_norm_var": 0.82972412109375, "learning_rate": 0.0001, "loss": 4.5098, "loss/crossentropy": 2.0383604168891907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21504472196102142, "step": 10036 }, { "epoch": 0.20076, "grad_norm": 2.078125, "grad_norm_var": 0.81461181640625, "learning_rate": 0.0001, "loss": 4.599, "loss/crossentropy": 2.42622447013855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2143569439649582, "step": 10038 }, { "epoch": 0.2008, "grad_norm": 2.109375, "grad_norm_var": 0.825640614827474, "learning_rate": 0.0001, "loss": 4.3216, "loss/crossentropy": 2.0825703144073486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21215181052684784, "step": 10040 }, { "epoch": 0.20084, "grad_norm": 2.140625, "grad_norm_var": 0.8326515197753906, "learning_rate": 0.0001, "loss": 3.9615, "loss/crossentropy": 2.1682112216949463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23538567870855331, "step": 10042 }, { "epoch": 0.20088, "grad_norm": 1.984375, "grad_norm_var": 0.8466957092285157, "learning_rate": 0.0001, "loss": 4.2244, "loss/crossentropy": 2.0161439180374146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21078374981880188, "step": 10044 }, { "epoch": 0.20092, "grad_norm": 2.078125, "grad_norm_var": 0.018143463134765624, "learning_rate": 0.0001, "loss": 4.5491, "loss/crossentropy": 2.2262184619903564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2855340391397476, "step": 10046 }, { "epoch": 0.20096, "grad_norm": 2.203125, "grad_norm_var": 0.011321767171223959, "learning_rate": 0.0001, "loss": 4.5078, "loss/crossentropy": 2.454360246658325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2287754938006401, "step": 10048 }, { "epoch": 0.201, "grad_norm": 2.015625, "grad_norm_var": 0.007940419514973958, "learning_rate": 0.0001, "loss": 4.1146, "loss/crossentropy": 1.970844566822052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20197579264640808, "step": 10050 }, { "epoch": 0.20104, "grad_norm": 2.03125, "grad_norm_var": 0.006211090087890625, "learning_rate": 0.0001, "loss": 4.3573, "loss/crossentropy": 2.0942559242248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21927295625209808, "step": 10052 }, { "epoch": 0.20108, "grad_norm": 2.0, "grad_norm_var": 0.005771636962890625, "learning_rate": 0.0001, "loss": 4.036, "loss/crossentropy": 1.868508517742157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838754191994667, "step": 10054 }, { "epoch": 0.20112, "grad_norm": 2.15625, "grad_norm_var": 0.0058095296223958336, "learning_rate": 0.0001, "loss": 4.2101, "loss/crossentropy": 2.028561532497406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21768346428871155, "step": 10056 }, { "epoch": 0.20116, "grad_norm": 1.9375, "grad_norm_var": 0.006669108072916667, "learning_rate": 0.0001, "loss": 3.9803, "loss/crossentropy": 2.3005030155181885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23332027345895767, "step": 10058 }, { "epoch": 0.2012, "grad_norm": 2.03125, "grad_norm_var": 0.0065826416015625, "learning_rate": 0.0001, "loss": 4.089, "loss/crossentropy": 1.7793474793434143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18733646720647812, "step": 10060 }, { "epoch": 0.20124, "grad_norm": 2.015625, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 4.0197, "loss/crossentropy": 2.0612844228744507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21674886345863342, "step": 10062 }, { "epoch": 0.20128, "grad_norm": 2.078125, "grad_norm_var": 0.004500071207682292, "learning_rate": 0.0001, "loss": 4.237, "loss/crossentropy": 2.3219568729400635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20989079773426056, "step": 10064 }, { "epoch": 0.20132, "grad_norm": 2.078125, "grad_norm_var": 0.004659016927083333, "learning_rate": 0.0001, "loss": 4.0959, "loss/crossentropy": 2.0941001176834106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19992788136005402, "step": 10066 }, { "epoch": 0.20136, "grad_norm": 2.09375, "grad_norm_var": 0.0038330078125, "learning_rate": 0.0001, "loss": 4.053, "loss/crossentropy": 1.7448341250419617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108384072780609, "step": 10068 }, { "epoch": 0.2014, "grad_norm": 2.0625, "grad_norm_var": 0.0059397379557291664, "learning_rate": 0.0001, "loss": 4.263, "loss/crossentropy": 2.0475903749465942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19875742495059967, "step": 10070 }, { "epoch": 0.20144, "grad_norm": 2.140625, "grad_norm_var": 0.0499755859375, "learning_rate": 0.0001, "loss": 4.1014, "loss/crossentropy": 2.0624433755874634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20577051490545273, "step": 10072 }, { "epoch": 0.20148, "grad_norm": 2.03125, "grad_norm_var": 0.04807840983072917, "learning_rate": 0.0001, "loss": 4.3612, "loss/crossentropy": 2.194978952407837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19964048266410828, "step": 10074 }, { "epoch": 0.20152, "grad_norm": 2.0625, "grad_norm_var": 0.04691162109375, "learning_rate": 0.0001, "loss": 4.4025, "loss/crossentropy": 2.011984169483185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22259068489074707, "step": 10076 }, { "epoch": 0.20156, "grad_norm": 2.203125, "grad_norm_var": 0.0458740234375, "learning_rate": 0.0001, "loss": 4.3539, "loss/crossentropy": 2.098285675048828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232050970196724, "step": 10078 }, { "epoch": 0.2016, "grad_norm": 1.9609375, "grad_norm_var": 0.04423726399739583, "learning_rate": 0.0001, "loss": 4.2772, "loss/crossentropy": 2.313677191734314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2224883735179901, "step": 10080 }, { "epoch": 0.20164, "grad_norm": 1.953125, "grad_norm_var": 0.04454523722330729, "learning_rate": 0.0001, "loss": 4.1523, "loss/crossentropy": 2.3712470531463623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23282987624406815, "step": 10082 }, { "epoch": 0.20168, "grad_norm": 2.125, "grad_norm_var": 0.043342844645182295, "learning_rate": 0.0001, "loss": 4.3173, "loss/crossentropy": 2.204525947570801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2384341061115265, "step": 10084 }, { "epoch": 0.20172, "grad_norm": 1.9453125, "grad_norm_var": 0.04528401692708333, "learning_rate": 0.0001, "loss": 3.9497, "loss/crossentropy": 1.8102558851242065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17472535371780396, "step": 10086 }, { "epoch": 0.20176, "grad_norm": 2.046875, "grad_norm_var": 0.005711873372395833, "learning_rate": 0.0001, "loss": 4.2119, "loss/crossentropy": 2.041890263557434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21201669424772263, "step": 10088 }, { "epoch": 0.2018, "grad_norm": 2.125, "grad_norm_var": 0.006026204427083333, "learning_rate": 0.0001, "loss": 4.3869, "loss/crossentropy": 1.9904854893684387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23453570157289505, "step": 10090 }, { "epoch": 0.20184, "grad_norm": 2.0625, "grad_norm_var": 0.00716552734375, "learning_rate": 0.0001, "loss": 4.4324, "loss/crossentropy": 2.199320912361145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22087062150239944, "step": 10092 }, { "epoch": 0.20188, "grad_norm": 2.203125, "grad_norm_var": 0.0073150634765625, "learning_rate": 0.0001, "loss": 4.392, "loss/crossentropy": 2.321953535079956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2463374137878418, "step": 10094 }, { "epoch": 0.20192, "grad_norm": 2.046875, "grad_norm_var": 0.006076812744140625, "learning_rate": 0.0001, "loss": 4.3276, "loss/crossentropy": 2.1109840869903564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23713566362857819, "step": 10096 }, { "epoch": 0.20196, "grad_norm": 1.9375, "grad_norm_var": 0.006322987874348958, "learning_rate": 0.0001, "loss": 3.9339, "loss/crossentropy": 1.9126858711242676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20392102003097534, "step": 10098 }, { "epoch": 0.202, "grad_norm": 2.09375, "grad_norm_var": 0.006268056233723959, "learning_rate": 0.0001, "loss": 4.2902, "loss/crossentropy": 2.073318660259247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20465004444122314, "step": 10100 }, { "epoch": 0.20204, "grad_norm": 2.390625, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 4.1883, "loss/crossentropy": 1.7532709836959839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22591036558151245, "step": 10102 }, { "epoch": 0.20208, "grad_norm": 2.21875, "grad_norm_var": 0.010660807291666666, "learning_rate": 0.0001, "loss": 4.3088, "loss/crossentropy": 2.1874141693115234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.227812297642231, "step": 10104 }, { "epoch": 0.20212, "grad_norm": 2.015625, "grad_norm_var": 0.014095052083333334, "learning_rate": 0.0001, "loss": 4.4738, "loss/crossentropy": 2.522923469543457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24104881286621094, "step": 10106 }, { "epoch": 0.20216, "grad_norm": 2.21875, "grad_norm_var": 0.013963826497395833, "learning_rate": 0.0001, "loss": 4.2823, "loss/crossentropy": 1.9359918236732483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19563085585832596, "step": 10108 }, { "epoch": 0.2022, "grad_norm": 2.015625, "grad_norm_var": 0.015315755208333334, "learning_rate": 0.0001, "loss": 4.3582, "loss/crossentropy": 2.390757203102112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24748887866735458, "step": 10110 }, { "epoch": 0.20224, "grad_norm": 2.0625, "grad_norm_var": 0.015013631184895833, "learning_rate": 0.0001, "loss": 4.5576, "loss/crossentropy": 2.419153571128845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2228071466088295, "step": 10112 }, { "epoch": 0.20228, "grad_norm": 2.03125, "grad_norm_var": 0.015404256184895833, "learning_rate": 0.0001, "loss": 4.0235, "loss/crossentropy": 1.7460771799087524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19431117177009583, "step": 10114 }, { "epoch": 0.20232, "grad_norm": 2.140625, "grad_norm_var": 0.01754150390625, "learning_rate": 0.0001, "loss": 4.601, "loss/crossentropy": 2.308094024658203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24104833602905273, "step": 10116 }, { "epoch": 0.20236, "grad_norm": 2.015625, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 4.1696, "loss/crossentropy": 2.0273314118385315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2325449138879776, "step": 10118 }, { "epoch": 0.2024, "grad_norm": 1.9453125, "grad_norm_var": 0.013792928059895833, "learning_rate": 0.0001, "loss": 4.016, "loss/crossentropy": 2.1261476278305054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21481933444738388, "step": 10120 }, { "epoch": 0.20244, "grad_norm": 2.03125, "grad_norm_var": 0.0107818603515625, "learning_rate": 0.0001, "loss": 4.478, "loss/crossentropy": 2.5006214380264282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25569023191928864, "step": 10122 }, { "epoch": 0.20248, "grad_norm": 1.8671875, "grad_norm_var": 0.011775461832682292, "learning_rate": 0.0001, "loss": 4.3208, "loss/crossentropy": 2.2033116817474365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2601129561662674, "step": 10124 }, { "epoch": 0.20252, "grad_norm": 2.09375, "grad_norm_var": 0.011572011311848958, "learning_rate": 0.0001, "loss": 4.1098, "loss/crossentropy": 2.2208765745162964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20958464592695236, "step": 10126 }, { "epoch": 0.20256, "grad_norm": 1.9765625, "grad_norm_var": 0.011848958333333333, "learning_rate": 0.0001, "loss": 3.943, "loss/crossentropy": 1.7752392888069153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22697371244430542, "step": 10128 }, { "epoch": 0.2026, "grad_norm": 2.0625, "grad_norm_var": 0.011937459309895834, "learning_rate": 0.0001, "loss": 4.0529, "loss/crossentropy": 1.6444379687309265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18721824884414673, "step": 10130 }, { "epoch": 0.20264, "grad_norm": 2.046875, "grad_norm_var": 0.007271321614583334, "learning_rate": 0.0001, "loss": 4.1796, "loss/crossentropy": 1.7681297659873962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24296899884939194, "step": 10132 }, { "epoch": 0.20268, "grad_norm": 2.015625, "grad_norm_var": 0.007614898681640625, "learning_rate": 0.0001, "loss": 4.1593, "loss/crossentropy": 1.8710024952888489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891346201300621, "step": 10134 }, { "epoch": 0.20272, "grad_norm": 2.125, "grad_norm_var": 0.007515462239583334, "learning_rate": 0.0001, "loss": 4.0822, "loss/crossentropy": 2.0258530974388123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22128118574619293, "step": 10136 }, { "epoch": 0.20276, "grad_norm": 2.015625, "grad_norm_var": 1.7479237874348958, "learning_rate": 0.0001, "loss": 4.3091, "loss/crossentropy": 2.3366141319274902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2612725794315338, "step": 10138 }, { "epoch": 0.2028, "grad_norm": 2.0625, "grad_norm_var": 1.739208730061849, "learning_rate": 0.0001, "loss": 4.357, "loss/crossentropy": 2.152353823184967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239176332950592, "step": 10140 }, { "epoch": 0.20284, "grad_norm": 2.15625, "grad_norm_var": 1.7346433003743489, "learning_rate": 0.0001, "loss": 4.2552, "loss/crossentropy": 1.8189843893051147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21529949456453323, "step": 10142 }, { "epoch": 0.20288, "grad_norm": 2.03125, "grad_norm_var": 1.7219970703125, "learning_rate": 0.0001, "loss": 4.3179, "loss/crossentropy": 2.308253049850464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21748851984739304, "step": 10144 }, { "epoch": 0.20292, "grad_norm": 2.015625, "grad_norm_var": 1.7289377848307292, "learning_rate": 0.0001, "loss": 4.1762, "loss/crossentropy": 1.953293800354004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1948333978652954, "step": 10146 }, { "epoch": 0.20296, "grad_norm": 1.953125, "grad_norm_var": 1.726512654622396, "learning_rate": 0.0001, "loss": 3.9573, "loss/crossentropy": 2.1122325658798218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085581198334694, "step": 10148 }, { "epoch": 0.203, "grad_norm": 2.0625, "grad_norm_var": 1.716387685139974, "learning_rate": 0.0001, "loss": 4.4209, "loss/crossentropy": 2.450512409210205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23670841753482819, "step": 10150 }, { "epoch": 0.20304, "grad_norm": 1.984375, "grad_norm_var": 1.7187327067057292, "learning_rate": 0.0001, "loss": 4.1131, "loss/crossentropy": 2.3858957290649414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23381955921649933, "step": 10152 }, { "epoch": 0.20308, "grad_norm": 2.0625, "grad_norm_var": 0.0060618082682291664, "learning_rate": 0.0001, "loss": 4.2154, "loss/crossentropy": 2.1214572191238403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22399096935987473, "step": 10154 }, { "epoch": 0.20312, "grad_norm": 1.90625, "grad_norm_var": 0.0082916259765625, "learning_rate": 0.0001, "loss": 4.0487, "loss/crossentropy": 1.9299064874649048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19886164367198944, "step": 10156 }, { "epoch": 0.20316, "grad_norm": 2.234375, "grad_norm_var": 0.009877268473307292, "learning_rate": 0.0001, "loss": 4.108, "loss/crossentropy": 1.9582993388175964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1859385445713997, "step": 10158 }, { "epoch": 0.2032, "grad_norm": 2.15625, "grad_norm_var": 0.008790842692057292, "learning_rate": 0.0001, "loss": 4.408, "loss/crossentropy": 2.0555814504623413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25231435894966125, "step": 10160 }, { "epoch": 0.20324, "grad_norm": 2.109375, "grad_norm_var": 0.009090169270833334, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 1.9849395155906677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20529592037200928, "step": 10162 }, { "epoch": 0.20328, "grad_norm": 2.125, "grad_norm_var": 0.0071441650390625, "learning_rate": 0.0001, "loss": 4.5619, "loss/crossentropy": 1.9529971480369568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997845619916916, "step": 10164 }, { "epoch": 0.20332, "grad_norm": 1.953125, "grad_norm_var": 0.009330240885416667, "learning_rate": 0.0001, "loss": 4.3117, "loss/crossentropy": 1.9885541200637817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19847019761800766, "step": 10166 }, { "epoch": 0.20336, "grad_norm": 2.21875, "grad_norm_var": 0.010013834635416666, "learning_rate": 0.0001, "loss": 4.3431, "loss/crossentropy": 1.8039852380752563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19332807511091232, "step": 10168 }, { "epoch": 0.2034, "grad_norm": 2.125, "grad_norm_var": 0.01004638671875, "learning_rate": 0.0001, "loss": 4.4287, "loss/crossentropy": 2.1633352041244507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22904948145151138, "step": 10170 }, { "epoch": 0.20344, "grad_norm": 2.046875, "grad_norm_var": 0.0070953369140625, "learning_rate": 0.0001, "loss": 4.1148, "loss/crossentropy": 2.2646039724349976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22390951961278915, "step": 10172 }, { "epoch": 0.20348, "grad_norm": 2.015625, "grad_norm_var": 0.005741119384765625, "learning_rate": 0.0001, "loss": 4.4114, "loss/crossentropy": 2.604608416557312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25521157681941986, "step": 10174 }, { "epoch": 0.20352, "grad_norm": 2.125, "grad_norm_var": 0.016257476806640626, "learning_rate": 0.0001, "loss": 4.3319, "loss/crossentropy": 2.286113977432251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23809745162725449, "step": 10176 }, { "epoch": 0.20356, "grad_norm": 1.9453125, "grad_norm_var": 0.01777928670247396, "learning_rate": 0.0001, "loss": 3.9951, "loss/crossentropy": 2.0746694207191467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21103999018669128, "step": 10178 }, { "epoch": 0.2036, "grad_norm": 2.0, "grad_norm_var": 0.018381500244140626, "learning_rate": 0.0001, "loss": 3.9892, "loss/crossentropy": 1.9076440930366516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048335075378418, "step": 10180 }, { "epoch": 0.20364, "grad_norm": 2.0, "grad_norm_var": 0.0178863525390625, "learning_rate": 0.0001, "loss": 4.1768, "loss/crossentropy": 1.8896904587745667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20643991231918335, "step": 10182 }, { "epoch": 0.20368, "grad_norm": 2.0625, "grad_norm_var": 0.016747029622395833, "learning_rate": 0.0001, "loss": 4.215, "loss/crossentropy": 2.156657338142395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22853681445121765, "step": 10184 }, { "epoch": 0.20372, "grad_norm": 2.203125, "grad_norm_var": 0.017772420247395834, "learning_rate": 0.0001, "loss": 4.6307, "loss/crossentropy": 2.3767203092575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2422076091170311, "step": 10186 }, { "epoch": 0.20376, "grad_norm": 1.9921875, "grad_norm_var": 0.01834894816080729, "learning_rate": 0.0001, "loss": 4.3402, "loss/crossentropy": 2.5728834867477417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22912005335092545, "step": 10188 }, { "epoch": 0.2038, "grad_norm": 1.9453125, "grad_norm_var": 0.02005182902018229, "learning_rate": 0.0001, "loss": 4.11, "loss/crossentropy": 2.1542125940322876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19575025141239166, "step": 10190 }, { "epoch": 0.20384, "grad_norm": 1.9765625, "grad_norm_var": 0.004610188802083333, "learning_rate": 0.0001, "loss": 3.9114, "loss/crossentropy": 1.5845852494239807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845107451081276, "step": 10192 }, { "epoch": 0.20388, "grad_norm": 2.34375, "grad_norm_var": 0.010957590738932292, "learning_rate": 0.0001, "loss": 4.2193, "loss/crossentropy": 2.1166247129440308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352868989109993, "step": 10194 }, { "epoch": 0.20392, "grad_norm": 2.0, "grad_norm_var": 0.010941314697265624, "learning_rate": 0.0001, "loss": 4.3374, "loss/crossentropy": 2.0129401683807373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22231722623109818, "step": 10196 }, { "epoch": 0.20396, "grad_norm": 2.125, "grad_norm_var": 0.010595703125, "learning_rate": 0.0001, "loss": 3.9709, "loss/crossentropy": 1.7132073044776917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18660317361354828, "step": 10198 }, { "epoch": 0.204, "grad_norm": 2.21875, "grad_norm_var": 0.016743977864583332, "learning_rate": 0.0001, "loss": 4.3959, "loss/crossentropy": 2.0064845085144043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20879874378442764, "step": 10200 }, { "epoch": 0.20404, "grad_norm": 1.9140625, "grad_norm_var": 0.01803766886393229, "learning_rate": 0.0001, "loss": 4.3558, "loss/crossentropy": 2.4205944538116455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24201631546020508, "step": 10202 }, { "epoch": 0.20408, "grad_norm": 1.9375, "grad_norm_var": 0.019694010416666668, "learning_rate": 0.0001, "loss": 3.8977, "loss/crossentropy": 2.0392255187034607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21578273177146912, "step": 10204 }, { "epoch": 0.20412, "grad_norm": 2.25, "grad_norm_var": 0.019230143229166666, "learning_rate": 0.0001, "loss": 4.1414, "loss/crossentropy": 1.8652849197387695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1896674558520317, "step": 10206 }, { "epoch": 0.20416, "grad_norm": 2.40625, "grad_norm_var": 0.02332331339518229, "learning_rate": 0.0001, "loss": 4.7698, "loss/crossentropy": 2.016683042049408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23144973814487457, "step": 10208 }, { "epoch": 0.2042, "grad_norm": 2.046875, "grad_norm_var": 0.020401763916015624, "learning_rate": 0.0001, "loss": 4.3691, "loss/crossentropy": 1.9395010471343994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19717292487621307, "step": 10210 }, { "epoch": 0.20424, "grad_norm": 2.234375, "grad_norm_var": 0.0210845947265625, "learning_rate": 0.0001, "loss": 4.0907, "loss/crossentropy": 1.7626919150352478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200415201485157, "step": 10212 }, { "epoch": 0.20428, "grad_norm": 2.171875, "grad_norm_var": 0.021955362955729165, "learning_rate": 0.0001, "loss": 4.3411, "loss/crossentropy": 2.3014339208602905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22324562072753906, "step": 10214 }, { "epoch": 0.20432, "grad_norm": 1.921875, "grad_norm_var": 0.019636027018229165, "learning_rate": 0.0001, "loss": 4.1237, "loss/crossentropy": 1.906779408454895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223847657442093, "step": 10216 }, { "epoch": 0.20436, "grad_norm": 2.046875, "grad_norm_var": 0.01789118448893229, "learning_rate": 0.0001, "loss": 4.4555, "loss/crossentropy": 2.085246205329895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21039249747991562, "step": 10218 }, { "epoch": 0.2044, "grad_norm": 2.171875, "grad_norm_var": 0.014806874593098958, "learning_rate": 0.0001, "loss": 4.4477, "loss/crossentropy": 2.213107645511627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22880114614963531, "step": 10220 }, { "epoch": 0.20444, "grad_norm": 2.078125, "grad_norm_var": 0.013392893473307292, "learning_rate": 0.0001, "loss": 3.9015, "loss/crossentropy": 1.9510034322738647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20935232937335968, "step": 10222 }, { "epoch": 0.20448, "grad_norm": 1.921875, "grad_norm_var": 0.008906809488932292, "learning_rate": 0.0001, "loss": 4.1237, "loss/crossentropy": 1.8595823645591736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20829308032989502, "step": 10224 }, { "epoch": 0.20452, "grad_norm": 2.015625, "grad_norm_var": 0.008990224202473958, "learning_rate": 0.0001, "loss": 4.1287, "loss/crossentropy": 1.8250519037246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18982955813407898, "step": 10226 }, { "epoch": 0.20456, "grad_norm": 2.15625, "grad_norm_var": 0.0074859619140625, "learning_rate": 0.0001, "loss": 4.3656, "loss/crossentropy": 2.410372495651245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22655458748340607, "step": 10228 }, { "epoch": 0.2046, "grad_norm": 1.9453125, "grad_norm_var": 0.007533518473307291, "learning_rate": 0.0001, "loss": 4.2581, "loss/crossentropy": 2.321051836013794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280098795890808, "step": 10230 }, { "epoch": 0.20464, "grad_norm": 1.9296875, "grad_norm_var": 0.007380167643229167, "learning_rate": 0.0001, "loss": 3.9927, "loss/crossentropy": 2.266388177871704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2341095432639122, "step": 10232 }, { "epoch": 0.20468, "grad_norm": 2.15625, "grad_norm_var": 0.007478841145833333, "learning_rate": 0.0001, "loss": 4.5263, "loss/crossentropy": 2.390430450439453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23684432357549667, "step": 10234 }, { "epoch": 0.20472, "grad_norm": 3.390625, "grad_norm_var": 0.11719563802083334, "learning_rate": 0.0001, "loss": 4.4269, "loss/crossentropy": 2.07179594039917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21902770549058914, "step": 10236 }, { "epoch": 0.20476, "grad_norm": 2.1875, "grad_norm_var": 0.12704264322916667, "learning_rate": 0.0001, "loss": 4.3473, "loss/crossentropy": 1.686942458152771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20007195323705673, "step": 10238 }, { "epoch": 0.2048, "grad_norm": 2.0, "grad_norm_var": 0.26913248697916664, "learning_rate": 0.0001, "loss": 4.2375, "loss/crossentropy": 2.0421791076660156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187313288450241, "step": 10240 }, { "epoch": 0.20484, "grad_norm": 2.28125, "grad_norm_var": 0.26201985677083334, "learning_rate": 0.0001, "loss": 4.1641, "loss/crossentropy": 1.9503712058067322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879397749900818, "step": 10242 }, { "epoch": 0.20488, "grad_norm": 3.21875, "grad_norm_var": 0.82620849609375, "learning_rate": 0.0001, "loss": 4.413, "loss/crossentropy": 2.1453936100006104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22767861932516098, "step": 10244 }, { "epoch": 0.20492, "grad_norm": 2.171875, "grad_norm_var": 0.808221181233724, "learning_rate": 0.0001, "loss": 4.1924, "loss/crossentropy": 1.9007731080055237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20069654285907745, "step": 10246 }, { "epoch": 0.20496, "grad_norm": 1.875, "grad_norm_var": 0.8184832255045573, "learning_rate": 0.0001, "loss": 4.0685, "loss/crossentropy": 2.0545393228530884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983654722571373, "step": 10248 }, { "epoch": 0.205, "grad_norm": 2.1875, "grad_norm_var": 0.804272206624349, "learning_rate": 0.0001, "loss": 4.3391, "loss/crossentropy": 2.158636450767517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24239712953567505, "step": 10250 }, { "epoch": 0.20504, "grad_norm": 2.15625, "grad_norm_var": 0.7762794494628906, "learning_rate": 0.0001, "loss": 4.1608, "loss/crossentropy": 2.1119120121002197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2136205956339836, "step": 10252 }, { "epoch": 0.20508, "grad_norm": 2.21875, "grad_norm_var": 0.7838417053222656, "learning_rate": 0.0001, "loss": 4.3184, "loss/crossentropy": 2.0690027475357056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20616184175014496, "step": 10254 }, { "epoch": 0.20512, "grad_norm": 2.0625, "grad_norm_var": 0.6926798502604167, "learning_rate": 0.0001, "loss": 4.0635, "loss/crossentropy": 2.178507924079895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21580064296722412, "step": 10256 }, { "epoch": 0.20516, "grad_norm": 2.140625, "grad_norm_var": 0.6889719645182292, "learning_rate": 0.0001, "loss": 4.1433, "loss/crossentropy": 2.292190670967102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22163032740354538, "step": 10258 }, { "epoch": 0.2052, "grad_norm": 2.015625, "grad_norm_var": 0.03568115234375, "learning_rate": 0.0001, "loss": 4.0138, "loss/crossentropy": 2.066649317741394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081037238240242, "step": 10260 }, { "epoch": 0.20524, "grad_norm": 2.296875, "grad_norm_var": 0.05111490885416667, "learning_rate": 0.0001, "loss": 4.4315, "loss/crossentropy": 1.9017595052719116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19284649938344955, "step": 10262 }, { "epoch": 0.20528, "grad_norm": 2.125, "grad_norm_var": 0.04326960245768229, "learning_rate": 0.0001, "loss": 4.0974, "loss/crossentropy": 2.1215697526931763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244422286748886, "step": 10264 }, { "epoch": 0.20532, "grad_norm": 2.03125, "grad_norm_var": 0.02539647420247396, "learning_rate": 0.0001, "loss": 4.3231, "loss/crossentropy": 2.170191764831543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308938354253769, "step": 10266 }, { "epoch": 0.20536, "grad_norm": 2.140625, "grad_norm_var": 0.02535985310872396, "learning_rate": 0.0001, "loss": 4.3472, "loss/crossentropy": 2.0430655479431152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2435612976551056, "step": 10268 }, { "epoch": 0.2054, "grad_norm": 2.203125, "grad_norm_var": 0.025394439697265625, "learning_rate": 0.0001, "loss": 4.536, "loss/crossentropy": 2.3141634464263916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2234780564904213, "step": 10270 }, { "epoch": 0.20544, "grad_norm": 2.09375, "grad_norm_var": 0.023371378580729168, "learning_rate": 0.0001, "loss": 4.1944, "loss/crossentropy": 2.310709834098816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505229711532593, "step": 10272 }, { "epoch": 0.20548, "grad_norm": 2.078125, "grad_norm_var": 0.02434056599934896, "learning_rate": 0.0001, "loss": 4.0664, "loss/crossentropy": 1.9158611297607422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062816470861435, "step": 10274 }, { "epoch": 0.20552, "grad_norm": 2.015625, "grad_norm_var": 0.02697728474934896, "learning_rate": 0.0001, "loss": 4.0545, "loss/crossentropy": 2.0835859179496765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21051569283008575, "step": 10276 }, { "epoch": 0.20556, "grad_norm": 2.125, "grad_norm_var": 0.006951649983723958, "learning_rate": 0.0001, "loss": 4.4047, "loss/crossentropy": 1.9533037543296814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20007294416427612, "step": 10278 }, { "epoch": 0.2056, "grad_norm": 1.9453125, "grad_norm_var": 0.010701497395833334, "learning_rate": 0.0001, "loss": 4.4499, "loss/crossentropy": 2.3090076446533203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295464426279068, "step": 10280 }, { "epoch": 0.20564, "grad_norm": 2.171875, "grad_norm_var": 0.014989217122395834, "learning_rate": 0.0001, "loss": 4.2708, "loss/crossentropy": 2.2951393127441406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22297964990139008, "step": 10282 }, { "epoch": 0.20568, "grad_norm": 2.09375, "grad_norm_var": 0.014574178059895833, "learning_rate": 0.0001, "loss": 4.1153, "loss/crossentropy": 2.311514675617218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2287898138165474, "step": 10284 }, { "epoch": 0.20572, "grad_norm": 1.921875, "grad_norm_var": 0.014989217122395834, "learning_rate": 0.0001, "loss": 4.2287, "loss/crossentropy": 2.277890205383301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21169421076774597, "step": 10286 }, { "epoch": 0.20576, "grad_norm": 2.109375, "grad_norm_var": 0.015143839518229167, "learning_rate": 0.0001, "loss": 4.4448, "loss/crossentropy": 2.070693612098694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21682647615671158, "step": 10288 }, { "epoch": 0.2058, "grad_norm": 1.984375, "grad_norm_var": 0.014788564046223958, "learning_rate": 0.0001, "loss": 4.205, "loss/crossentropy": 2.223360061645508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24135399609804153, "step": 10290 }, { "epoch": 0.20584, "grad_norm": 2.40625, "grad_norm_var": 0.018155670166015624, "learning_rate": 0.0001, "loss": 4.3304, "loss/crossentropy": 2.430101752281189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21579021215438843, "step": 10292 }, { "epoch": 0.20588, "grad_norm": 2.03125, "grad_norm_var": 0.018173980712890624, "learning_rate": 0.0001, "loss": 4.1615, "loss/crossentropy": 1.960309624671936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22619223594665527, "step": 10294 }, { "epoch": 0.20592, "grad_norm": 2.515625, "grad_norm_var": 0.024217732747395835, "learning_rate": 0.0001, "loss": 4.8862, "loss/crossentropy": 2.0035970211029053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20148956030607224, "step": 10296 }, { "epoch": 0.20596, "grad_norm": 2.09375, "grad_norm_var": 0.021968587239583334, "learning_rate": 0.0001, "loss": 4.175, "loss/crossentropy": 1.976987361907959, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21035870164632797, "step": 10298 }, { "epoch": 0.206, "grad_norm": 2.140625, "grad_norm_var": 0.021675618489583333, "learning_rate": 0.0001, "loss": 4.1596, "loss/crossentropy": 2.0631470680236816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21725613623857498, "step": 10300 }, { "epoch": 0.20604, "grad_norm": 2.421875, "grad_norm_var": 0.02693456013997396, "learning_rate": 0.0001, "loss": 4.4734, "loss/crossentropy": 2.2747987508773804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295055389404297, "step": 10302 }, { "epoch": 0.20608, "grad_norm": 1.9609375, "grad_norm_var": 0.029255167643229166, "learning_rate": 0.0001, "loss": 4.1255, "loss/crossentropy": 2.0811264514923096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21890189498662949, "step": 10304 }, { "epoch": 0.20612, "grad_norm": 2.203125, "grad_norm_var": 0.028319295247395834, "learning_rate": 0.0001, "loss": 4.4391, "loss/crossentropy": 2.2474766969680786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2430667206645012, "step": 10306 }, { "epoch": 0.20616, "grad_norm": 2.03125, "grad_norm_var": 0.024933878580729166, "learning_rate": 0.0001, "loss": 4.3016, "loss/crossentropy": 1.899698257446289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21101202815771103, "step": 10308 }, { "epoch": 0.2062, "grad_norm": 2.078125, "grad_norm_var": 0.023851521809895835, "learning_rate": 0.0001, "loss": 3.9769, "loss/crossentropy": 1.6432967782020569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19004638493061066, "step": 10310 }, { "epoch": 0.20624, "grad_norm": 2.046875, "grad_norm_var": 0.0149658203125, "learning_rate": 0.0001, "loss": 4.3901, "loss/crossentropy": 1.9606398940086365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24546240270137787, "step": 10312 }, { "epoch": 0.20628, "grad_norm": 2.15625, "grad_norm_var": 0.015021769205729167, "learning_rate": 0.0001, "loss": 4.0988, "loss/crossentropy": 2.140414595603943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2428836077451706, "step": 10314 }, { "epoch": 0.20632, "grad_norm": 2.3125, "grad_norm_var": 0.018393707275390626, "learning_rate": 0.0001, "loss": 4.3768, "loss/crossentropy": 2.0886260271072388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2201090231537819, "step": 10316 }, { "epoch": 0.20636, "grad_norm": 2.234375, "grad_norm_var": 0.1039947509765625, "learning_rate": 0.0001, "loss": 4.5696, "loss/crossentropy": 2.3409098386764526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24670831114053726, "step": 10318 }, { "epoch": 0.2064, "grad_norm": 2.09375, "grad_norm_var": 0.10114313761393229, "learning_rate": 0.0001, "loss": 4.4153, "loss/crossentropy": 2.233125150203705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21548831462860107, "step": 10320 }, { "epoch": 0.20644, "grad_norm": 2.0625, "grad_norm_var": 0.1031206766764323, "learning_rate": 0.0001, "loss": 4.0617, "loss/crossentropy": 2.131038188934326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20244313776493073, "step": 10322 }, { "epoch": 0.20648, "grad_norm": 2.03125, "grad_norm_var": 0.10423965454101562, "learning_rate": 0.0001, "loss": 4.3428, "loss/crossentropy": 2.1683152318000793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21852879226207733, "step": 10324 }, { "epoch": 0.20652, "grad_norm": 2.09375, "grad_norm_var": 0.10465672810872396, "learning_rate": 0.0001, "loss": 4.4117, "loss/crossentropy": 2.2986634969711304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24344030022621155, "step": 10326 }, { "epoch": 0.20656, "grad_norm": 1.9140625, "grad_norm_var": 0.10876057942708334, "learning_rate": 0.0001, "loss": 4.063, "loss/crossentropy": 2.2356297969818115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21090354025363922, "step": 10328 }, { "epoch": 0.2066, "grad_norm": 2.28125, "grad_norm_var": 0.10851949055989583, "learning_rate": 0.0001, "loss": 4.4235, "loss/crossentropy": 2.6431000232696533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26503315567970276, "step": 10330 }, { "epoch": 0.20664, "grad_norm": 2.125, "grad_norm_var": 0.10668919881184896, "learning_rate": 0.0001, "loss": 4.2277, "loss/crossentropy": 2.0261669754981995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21528497338294983, "step": 10332 }, { "epoch": 0.20668, "grad_norm": 2.15625, "grad_norm_var": 0.007755279541015625, "learning_rate": 0.0001, "loss": 4.1092, "loss/crossentropy": 1.5593605041503906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18069667369127274, "step": 10334 }, { "epoch": 0.20672, "grad_norm": 1.9296875, "grad_norm_var": 0.0087646484375, "learning_rate": 0.0001, "loss": 4.1836, "loss/crossentropy": 1.953243374824524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19982123374938965, "step": 10336 }, { "epoch": 0.20676, "grad_norm": 2.09375, "grad_norm_var": 0.0088531494140625, "learning_rate": 0.0001, "loss": 4.4369, "loss/crossentropy": 2.067806303501129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2209654077887535, "step": 10338 }, { "epoch": 0.2068, "grad_norm": 2.0625, "grad_norm_var": 0.0088043212890625, "learning_rate": 0.0001, "loss": 4.2458, "loss/crossentropy": 1.9948397874832153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21432363241910934, "step": 10340 }, { "epoch": 0.20684, "grad_norm": 2.046875, "grad_norm_var": 0.010231272379557291, "learning_rate": 0.0001, "loss": 3.7643, "loss/crossentropy": 1.7932568788528442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19161275029182434, "step": 10342 }, { "epoch": 0.20688, "grad_norm": 2.3125, "grad_norm_var": 0.0162017822265625, "learning_rate": 0.0001, "loss": 4.5325, "loss/crossentropy": 2.021036922931671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21150880306959152, "step": 10344 }, { "epoch": 0.20692, "grad_norm": 1.9453125, "grad_norm_var": 0.014623769124348958, "learning_rate": 0.0001, "loss": 3.9351, "loss/crossentropy": 2.0004186630249023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18971875309944153, "step": 10346 }, { "epoch": 0.20696, "grad_norm": 1.984375, "grad_norm_var": 0.014898427327473958, "learning_rate": 0.0001, "loss": 4.1234, "loss/crossentropy": 2.2949434518814087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356676608324051, "step": 10348 }, { "epoch": 0.207, "grad_norm": 2.15625, "grad_norm_var": 0.013108062744140624, "learning_rate": 0.0001, "loss": 4.1755, "loss/crossentropy": 2.161319613456726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24161628633737564, "step": 10350 }, { "epoch": 0.20704, "grad_norm": 1.921875, "grad_norm_var": 0.013285319010416666, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 2.027602195739746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19196761399507523, "step": 10352 }, { "epoch": 0.20708, "grad_norm": 2.109375, "grad_norm_var": 0.013703409830729167, "learning_rate": 0.0001, "loss": 4.3658, "loss/crossentropy": 2.163583278656006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21044812351465225, "step": 10354 }, { "epoch": 0.20712, "grad_norm": 1.9921875, "grad_norm_var": 0.014045969645182291, "learning_rate": 0.0001, "loss": 4.0419, "loss/crossentropy": 2.055150866508484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21587203443050385, "step": 10356 }, { "epoch": 0.20716, "grad_norm": 2.4375, "grad_norm_var": 0.0192047119140625, "learning_rate": 0.0001, "loss": 4.2947, "loss/crossentropy": 2.368631362915039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2406565323472023, "step": 10358 }, { "epoch": 0.2072, "grad_norm": 2.234375, "grad_norm_var": 0.016486612955729167, "learning_rate": 0.0001, "loss": 4.6473, "loss/crossentropy": 2.5399086475372314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2554238885641098, "step": 10360 }, { "epoch": 0.20724, "grad_norm": 2.046875, "grad_norm_var": 0.015366363525390624, "learning_rate": 0.0001, "loss": 4.2346, "loss/crossentropy": 2.0829185843467712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278829663991928, "step": 10362 }, { "epoch": 0.20728, "grad_norm": 2.078125, "grad_norm_var": 0.014371490478515625, "learning_rate": 0.0001, "loss": 4.2583, "loss/crossentropy": 1.9829052090644836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19677383452653885, "step": 10364 }, { "epoch": 0.20732, "grad_norm": 2.0625, "grad_norm_var": 0.014385732014973958, "learning_rate": 0.0001, "loss": 4.3801, "loss/crossentropy": 2.2335458993911743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23407185822725296, "step": 10366 }, { "epoch": 0.20736, "grad_norm": 2.046875, "grad_norm_var": 0.012149810791015625, "learning_rate": 0.0001, "loss": 4.2074, "loss/crossentropy": 1.824280858039856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.203867107629776, "step": 10368 }, { "epoch": 0.2074, "grad_norm": 1.9765625, "grad_norm_var": 0.013898722330729167, "learning_rate": 0.0001, "loss": 4.0666, "loss/crossentropy": 2.1007773876190186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20312541723251343, "step": 10370 }, { "epoch": 0.20744, "grad_norm": 2.109375, "grad_norm_var": 0.013350168863932291, "learning_rate": 0.0001, "loss": 4.3692, "loss/crossentropy": 1.953888475894928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20697196573019028, "step": 10372 }, { "epoch": 0.20748, "grad_norm": 2.28125, "grad_norm_var": 0.03509089152018229, "learning_rate": 0.0001, "loss": 4.2786, "loss/crossentropy": 2.186478853225708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24410755187273026, "step": 10374 }, { "epoch": 0.20752, "grad_norm": 2.34375, "grad_norm_var": 0.03706232706705729, "learning_rate": 0.0001, "loss": 3.9974, "loss/crossentropy": 1.9011998772621155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983819529414177, "step": 10376 }, { "epoch": 0.20756, "grad_norm": 2.03125, "grad_norm_var": 0.03695246378580729, "learning_rate": 0.0001, "loss": 4.5884, "loss/crossentropy": 2.599787950515747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25155550241470337, "step": 10378 }, { "epoch": 0.2076, "grad_norm": 2.078125, "grad_norm_var": 0.037021636962890625, "learning_rate": 0.0001, "loss": 4.2702, "loss/crossentropy": 2.0120421648025513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21281170845031738, "step": 10380 }, { "epoch": 0.20764, "grad_norm": 2.09375, "grad_norm_var": 0.036710357666015624, "learning_rate": 0.0001, "loss": 4.4915, "loss/crossentropy": 2.0685967803001404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23026303946971893, "step": 10382 }, { "epoch": 0.20768, "grad_norm": 2.125, "grad_norm_var": 0.0445068359375, "learning_rate": 0.0001, "loss": 4.1171, "loss/crossentropy": 1.8392394185066223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19240978360176086, "step": 10384 }, { "epoch": 0.20772, "grad_norm": 1.9453125, "grad_norm_var": 0.04523518880208333, "learning_rate": 0.0001, "loss": 4.2428, "loss/crossentropy": 1.786954402923584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20547360181808472, "step": 10386 }, { "epoch": 0.20776, "grad_norm": 2.234375, "grad_norm_var": 0.0452301025390625, "learning_rate": 0.0001, "loss": 4.2084, "loss/crossentropy": 1.6960806250572205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1748766005039215, "step": 10388 }, { "epoch": 0.2078, "grad_norm": 2.3125, "grad_norm_var": 0.018693033854166666, "learning_rate": 0.0001, "loss": 4.2491, "loss/crossentropy": 2.271879196166992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25173740088939667, "step": 10390 }, { "epoch": 0.20784, "grad_norm": 2.109375, "grad_norm_var": 0.015778605143229166, "learning_rate": 0.0001, "loss": 4.5199, "loss/crossentropy": 2.2860567569732666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23291928321123123, "step": 10392 }, { "epoch": 0.20788, "grad_norm": 2.046875, "grad_norm_var": 0.0148193359375, "learning_rate": 0.0001, "loss": 4.1871, "loss/crossentropy": 1.9925037026405334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212161086499691, "step": 10394 }, { "epoch": 0.20792, "grad_norm": 2.140625, "grad_norm_var": 0.016306304931640626, "learning_rate": 0.0001, "loss": 4.3728, "loss/crossentropy": 2.1189831495285034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19486143440008163, "step": 10396 }, { "epoch": 0.20796, "grad_norm": 1.9921875, "grad_norm_var": 0.0168121337890625, "learning_rate": 0.0001, "loss": 4.2325, "loss/crossentropy": 2.170132279396057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23083829134702682, "step": 10398 }, { "epoch": 0.208, "grad_norm": 2.109375, "grad_norm_var": 0.012129465738932291, "learning_rate": 0.0001, "loss": 4.0773, "loss/crossentropy": 1.8486035466194153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18999971449375153, "step": 10400 }, { "epoch": 0.20804, "grad_norm": 1.9765625, "grad_norm_var": 0.011240386962890625, "learning_rate": 0.0001, "loss": 4.138, "loss/crossentropy": 1.727788269519806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19234652817249298, "step": 10402 }, { "epoch": 0.20808, "grad_norm": 1.9921875, "grad_norm_var": 0.01046142578125, "learning_rate": 0.0001, "loss": 4.4102, "loss/crossentropy": 2.196265935897827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21071433275938034, "step": 10404 }, { "epoch": 0.20812, "grad_norm": 2.15625, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 4.3378, "loss/crossentropy": 1.7230273485183716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19570616632699966, "step": 10406 }, { "epoch": 0.20816, "grad_norm": 2.109375, "grad_norm_var": 0.00567626953125, "learning_rate": 0.0001, "loss": 3.9693, "loss/crossentropy": 1.8815646767616272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055530995130539, "step": 10408 }, { "epoch": 0.2082, "grad_norm": 2.015625, "grad_norm_var": 0.006400299072265625, "learning_rate": 0.0001, "loss": 4.1974, "loss/crossentropy": 2.2131329774856567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22710958123207092, "step": 10410 }, { "epoch": 0.20824, "grad_norm": 2.046875, "grad_norm_var": 0.006103515625, "learning_rate": 0.0001, "loss": 4.1904, "loss/crossentropy": 1.5199981927871704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16881641000509262, "step": 10412 }, { "epoch": 0.20828, "grad_norm": 2.0625, "grad_norm_var": 0.005356597900390625, "learning_rate": 0.0001, "loss": 4.2456, "loss/crossentropy": 2.305867075920105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2523300349712372, "step": 10414 }, { "epoch": 0.20832, "grad_norm": 2.0625, "grad_norm_var": 0.004571278889973958, "learning_rate": 0.0001, "loss": 4.2727, "loss/crossentropy": 1.989980161190033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19124356657266617, "step": 10416 }, { "epoch": 0.20836, "grad_norm": 2.046875, "grad_norm_var": 0.004255167643229167, "learning_rate": 0.0001, "loss": 4.4745, "loss/crossentropy": 2.165328025817871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23673278093338013, "step": 10418 }, { "epoch": 0.2084, "grad_norm": 2.328125, "grad_norm_var": 0.008074696858723958, "learning_rate": 0.0001, "loss": 4.5429, "loss/crossentropy": 2.2451056241989136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23130114376544952, "step": 10420 }, { "epoch": 0.20844, "grad_norm": 1.953125, "grad_norm_var": 0.0106689453125, "learning_rate": 0.0001, "loss": 3.9259, "loss/crossentropy": 2.1694064140319824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21424317359924316, "step": 10422 }, { "epoch": 0.20848, "grad_norm": 2.1875, "grad_norm_var": 0.012059529622395834, "learning_rate": 0.0001, "loss": 4.3558, "loss/crossentropy": 2.140601873397827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22452331334352493, "step": 10424 }, { "epoch": 0.20852, "grad_norm": 2.125, "grad_norm_var": 0.011476389567057292, "learning_rate": 0.0001, "loss": 4.3058, "loss/crossentropy": 2.2076770067214966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21608063578605652, "step": 10426 }, { "epoch": 0.20856, "grad_norm": 2.125, "grad_norm_var": 0.011579386393229167, "learning_rate": 0.0001, "loss": 4.151, "loss/crossentropy": 2.1738568544387817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288571298122406, "step": 10428 }, { "epoch": 0.2086, "grad_norm": 2.015625, "grad_norm_var": 0.012165323893229166, "learning_rate": 0.0001, "loss": 4.2576, "loss/crossentropy": 2.2586612701416016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22107571363449097, "step": 10430 }, { "epoch": 0.20864, "grad_norm": 1.9609375, "grad_norm_var": 0.014511871337890624, "learning_rate": 0.0001, "loss": 4.5441, "loss/crossentropy": 2.336306095123291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23462744057178497, "step": 10432 }, { "epoch": 0.20868, "grad_norm": 2.046875, "grad_norm_var": 0.0148590087890625, "learning_rate": 0.0001, "loss": 4.1625, "loss/crossentropy": 2.3164994716644287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21541497856378555, "step": 10434 }, { "epoch": 0.20872, "grad_norm": 2.046875, "grad_norm_var": 0.0121734619140625, "learning_rate": 0.0001, "loss": 4.2497, "loss/crossentropy": 1.848636507987976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20993927866220474, "step": 10436 }, { "epoch": 0.20876, "grad_norm": 2.25, "grad_norm_var": 0.010509999593098958, "learning_rate": 0.0001, "loss": 4.4985, "loss/crossentropy": 2.234964370727539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21675845235586166, "step": 10438 }, { "epoch": 0.2088, "grad_norm": 2.125, "grad_norm_var": 0.009159088134765625, "learning_rate": 0.0001, "loss": 4.2389, "loss/crossentropy": 1.9301238656044006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21708600223064423, "step": 10440 }, { "epoch": 0.20884, "grad_norm": 2.171875, "grad_norm_var": 0.010721842447916666, "learning_rate": 0.0001, "loss": 4.1706, "loss/crossentropy": 2.231620192527771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23156532645225525, "step": 10442 }, { "epoch": 0.20888, "grad_norm": 2.015625, "grad_norm_var": 0.010237375895182291, "learning_rate": 0.0001, "loss": 4.18, "loss/crossentropy": 1.8612747192382812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914522647857666, "step": 10444 }, { "epoch": 0.20892, "grad_norm": 2.109375, "grad_norm_var": 0.009907786051432292, "learning_rate": 0.0001, "loss": 4.1713, "loss/crossentropy": 2.2229456305503845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23199696838855743, "step": 10446 }, { "epoch": 0.20896, "grad_norm": 2.046875, "grad_norm_var": 0.009708658854166666, "learning_rate": 0.0001, "loss": 4.0556, "loss/crossentropy": 1.9055940508842468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158679962158203, "step": 10448 }, { "epoch": 0.209, "grad_norm": 1.8984375, "grad_norm_var": 0.011774698893229166, "learning_rate": 0.0001, "loss": 3.8876, "loss/crossentropy": 1.5537404417991638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17176809161901474, "step": 10450 }, { "epoch": 0.20904, "grad_norm": 1.890625, "grad_norm_var": 0.0123931884765625, "learning_rate": 0.0001, "loss": 3.9222, "loss/crossentropy": 2.319318413734436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001250088214874, "step": 10452 }, { "epoch": 0.20908, "grad_norm": 2.140625, "grad_norm_var": 0.010383097330729167, "learning_rate": 0.0001, "loss": 4.421, "loss/crossentropy": 2.2334396839141846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.222968190908432, "step": 10454 }, { "epoch": 0.20912, "grad_norm": 2.109375, "grad_norm_var": 0.0110260009765625, "learning_rate": 0.0001, "loss": 4.5172, "loss/crossentropy": 2.5762773752212524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26605312526226044, "step": 10456 }, { "epoch": 0.20916, "grad_norm": 2.0, "grad_norm_var": 0.010503896077473958, "learning_rate": 0.0001, "loss": 4.1109, "loss/crossentropy": 1.7634761333465576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18641388416290283, "step": 10458 }, { "epoch": 0.2092, "grad_norm": 1.875, "grad_norm_var": 0.013224029541015625, "learning_rate": 0.0001, "loss": 4.126, "loss/crossentropy": 2.28191876411438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22916647791862488, "step": 10460 }, { "epoch": 0.20924, "grad_norm": 2.125, "grad_norm_var": 0.013903554280598958, "learning_rate": 0.0001, "loss": 4.1984, "loss/crossentropy": 1.9712265729904175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2293689101934433, "step": 10462 }, { "epoch": 0.20928, "grad_norm": 2.109375, "grad_norm_var": 0.012292226155598959, "learning_rate": 0.0001, "loss": 4.3826, "loss/crossentropy": 2.425857424736023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.255269430577755, "step": 10464 }, { "epoch": 0.20932, "grad_norm": 2.109375, "grad_norm_var": 0.011031087239583333, "learning_rate": 0.0001, "loss": 4.1919, "loss/crossentropy": 2.0697131752967834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22233501076698303, "step": 10466 }, { "epoch": 0.20936, "grad_norm": 2.140625, "grad_norm_var": 0.008854166666666666, "learning_rate": 0.0001, "loss": 4.4017, "loss/crossentropy": 1.8367178440093994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063748985528946, "step": 10468 }, { "epoch": 0.2094, "grad_norm": 2.046875, "grad_norm_var": 0.018192545572916666, "learning_rate": 0.0001, "loss": 4.1864, "loss/crossentropy": 1.977232813835144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20568673312664032, "step": 10470 }, { "epoch": 0.20944, "grad_norm": 2.0, "grad_norm_var": 0.018195597330729167, "learning_rate": 0.0001, "loss": 4.1622, "loss/crossentropy": 2.1934465169906616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2175101339817047, "step": 10472 }, { "epoch": 0.20948, "grad_norm": 2.09375, "grad_norm_var": 0.016047159830729168, "learning_rate": 0.0001, "loss": 4.3005, "loss/crossentropy": 1.7997339367866516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20166774094104767, "step": 10474 }, { "epoch": 0.20952, "grad_norm": 2.140625, "grad_norm_var": 0.014671834309895833, "learning_rate": 0.0001, "loss": 4.1167, "loss/crossentropy": 1.9334582090377808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19190441817045212, "step": 10476 }, { "epoch": 0.20956, "grad_norm": 2.109375, "grad_norm_var": 0.015148671468098958, "learning_rate": 0.0001, "loss": 3.9434, "loss/crossentropy": 1.7263885140419006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17968116700649261, "step": 10478 }, { "epoch": 0.2096, "grad_norm": 1.9375, "grad_norm_var": 0.01661961873372396, "learning_rate": 0.0001, "loss": 4.1153, "loss/crossentropy": 2.1710296869277954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19717055559158325, "step": 10480 }, { "epoch": 0.20964, "grad_norm": 2.171875, "grad_norm_var": 0.016947174072265626, "learning_rate": 0.0001, "loss": 4.2089, "loss/crossentropy": 2.043896973133087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21403591334819794, "step": 10482 }, { "epoch": 0.20968, "grad_norm": 2.203125, "grad_norm_var": 0.02459691365559896, "learning_rate": 0.0001, "loss": 3.9237, "loss/crossentropy": 1.7813313603401184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20273292809724808, "step": 10484 }, { "epoch": 0.20972, "grad_norm": 2.0625, "grad_norm_var": 0.013155110677083333, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 1.8628470301628113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945454627275467, "step": 10486 }, { "epoch": 0.20976, "grad_norm": 2.03125, "grad_norm_var": 0.013044230143229167, "learning_rate": 0.0001, "loss": 4.2279, "loss/crossentropy": 2.2047882080078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23798923939466476, "step": 10488 }, { "epoch": 0.2098, "grad_norm": 1.9609375, "grad_norm_var": 0.012697092692057292, "learning_rate": 0.0001, "loss": 4.1327, "loss/crossentropy": 1.8838441967964172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2224583625793457, "step": 10490 }, { "epoch": 0.20984, "grad_norm": 2.015625, "grad_norm_var": 0.011201731363932292, "learning_rate": 0.0001, "loss": 4.1985, "loss/crossentropy": 1.9326539039611816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116800993680954, "step": 10492 }, { "epoch": 0.20988, "grad_norm": 2.375, "grad_norm_var": 0.017829386393229167, "learning_rate": 0.0001, "loss": 4.2023, "loss/crossentropy": 1.7767577171325684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20496132969856262, "step": 10494 }, { "epoch": 0.20992, "grad_norm": 2.015625, "grad_norm_var": 0.016996256510416665, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 1.996176838874817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979241669178009, "step": 10496 }, { "epoch": 0.20996, "grad_norm": 2.015625, "grad_norm_var": 0.016136678059895833, "learning_rate": 0.0001, "loss": 4.1467, "loss/crossentropy": 2.5282262563705444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23003733158111572, "step": 10498 }, { "epoch": 0.21, "grad_norm": 2.109375, "grad_norm_var": 0.008760579427083333, "learning_rate": 0.0001, "loss": 4.1981, "loss/crossentropy": 2.3120675086975098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22543538361787796, "step": 10500 }, { "epoch": 0.21004, "grad_norm": 2.15625, "grad_norm_var": 0.008766428629557291, "learning_rate": 0.0001, "loss": 4.1216, "loss/crossentropy": 2.2110280990600586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23066550493240356, "step": 10502 }, { "epoch": 0.21008, "grad_norm": 2.015625, "grad_norm_var": 0.009242502848307292, "learning_rate": 0.0001, "loss": 4.0804, "loss/crossentropy": 1.6792908906936646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20689093321561813, "step": 10504 }, { "epoch": 0.21012, "grad_norm": 2.296875, "grad_norm_var": 0.011116536458333333, "learning_rate": 0.0001, "loss": 4.137, "loss/crossentropy": 1.9797767400741577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2185518741607666, "step": 10506 }, { "epoch": 0.21016, "grad_norm": 2.046875, "grad_norm_var": 0.011533355712890625, "learning_rate": 0.0001, "loss": 4.0243, "loss/crossentropy": 2.0001984238624573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22058366239070892, "step": 10508 }, { "epoch": 0.2102, "grad_norm": 2.125, "grad_norm_var": 0.006414540608723958, "learning_rate": 0.0001, "loss": 4.3707, "loss/crossentropy": 2.1622806787490845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21733585745096207, "step": 10510 }, { "epoch": 0.21024, "grad_norm": 2.03125, "grad_norm_var": 0.00740966796875, "learning_rate": 0.0001, "loss": 4.0392, "loss/crossentropy": 2.0224735736846924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20596059411764145, "step": 10512 }, { "epoch": 0.21028, "grad_norm": 1.8359375, "grad_norm_var": 0.011244455973307291, "learning_rate": 0.0001, "loss": 4.0712, "loss/crossentropy": 2.099945902824402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19777391105890274, "step": 10514 }, { "epoch": 0.21032, "grad_norm": 1.9921875, "grad_norm_var": 0.011823527018229167, "learning_rate": 0.0001, "loss": 4.1246, "loss/crossentropy": 1.9806578159332275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19959519058465958, "step": 10516 }, { "epoch": 0.21036, "grad_norm": 2.140625, "grad_norm_var": 0.012239329020182292, "learning_rate": 0.0001, "loss": 4.1182, "loss/crossentropy": 2.0056468844413757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21810457110404968, "step": 10518 }, { "epoch": 0.2104, "grad_norm": 2.171875, "grad_norm_var": 0.01785456339518229, "learning_rate": 0.0001, "loss": 4.1085, "loss/crossentropy": 1.9525137543678284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19195258617401123, "step": 10520 }, { "epoch": 0.21044, "grad_norm": 2.15625, "grad_norm_var": 0.01580988566080729, "learning_rate": 0.0001, "loss": 4.4648, "loss/crossentropy": 2.2391252517700195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23115740716457367, "step": 10522 }, { "epoch": 0.21048, "grad_norm": 2.203125, "grad_norm_var": 0.016727701822916666, "learning_rate": 0.0001, "loss": 4.6467, "loss/crossentropy": 2.550819158554077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2846238762140274, "step": 10524 }, { "epoch": 0.21052, "grad_norm": 2.234375, "grad_norm_var": 0.0176177978515625, "learning_rate": 0.0001, "loss": 4.2787, "loss/crossentropy": 2.181081712245941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23952369391918182, "step": 10526 }, { "epoch": 0.21056, "grad_norm": 1.9921875, "grad_norm_var": 0.01784032185872396, "learning_rate": 0.0001, "loss": 4.1627, "loss/crossentropy": 2.3070446848869324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21255015581846237, "step": 10528 }, { "epoch": 0.2106, "grad_norm": 2.140625, "grad_norm_var": 0.014235178629557291, "learning_rate": 0.0001, "loss": 4.3184, "loss/crossentropy": 1.9560331106185913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19652864336967468, "step": 10530 }, { "epoch": 0.21064, "grad_norm": 2.203125, "grad_norm_var": 0.0127593994140625, "learning_rate": 0.0001, "loss": 4.5728, "loss/crossentropy": 2.2470709085464478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25076867640018463, "step": 10532 }, { "epoch": 0.21068, "grad_norm": 2.03125, "grad_norm_var": 0.011767323811848958, "learning_rate": 0.0001, "loss": 4.2652, "loss/crossentropy": 2.0689820051193237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21815705299377441, "step": 10534 }, { "epoch": 0.21072, "grad_norm": 1.8671875, "grad_norm_var": 0.013499959309895834, "learning_rate": 0.0001, "loss": 3.9524, "loss/crossentropy": 2.0633797645568848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20865381509065628, "step": 10536 }, { "epoch": 0.21076, "grad_norm": 1.96875, "grad_norm_var": 0.014029947916666667, "learning_rate": 0.0001, "loss": 4.2644, "loss/crossentropy": 2.3332748413085938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21530094742774963, "step": 10538 }, { "epoch": 0.2108, "grad_norm": 2.078125, "grad_norm_var": 0.011942545572916666, "learning_rate": 0.0001, "loss": 4.1452, "loss/crossentropy": 2.093048572540283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23568043112754822, "step": 10540 }, { "epoch": 0.21084, "grad_norm": 2.125, "grad_norm_var": 0.009992472330729167, "learning_rate": 0.0001, "loss": 4.5464, "loss/crossentropy": 2.0059397220611572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.253838449716568, "step": 10542 }, { "epoch": 0.21088, "grad_norm": 2.0625, "grad_norm_var": 0.009284464518229167, "learning_rate": 0.0001, "loss": 4.1417, "loss/crossentropy": 2.278248429298401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23320979624986649, "step": 10544 }, { "epoch": 0.21092, "grad_norm": 2.0625, "grad_norm_var": 0.008540598551432292, "learning_rate": 0.0001, "loss": 4.0951, "loss/crossentropy": 2.063527822494507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21290308982133865, "step": 10546 }, { "epoch": 0.21096, "grad_norm": 1.953125, "grad_norm_var": 0.007511138916015625, "learning_rate": 0.0001, "loss": 4.1456, "loss/crossentropy": 2.092045545578003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21721098572015762, "step": 10548 }, { "epoch": 0.211, "grad_norm": 2.078125, "grad_norm_var": 0.06824111938476562, "learning_rate": 0.0001, "loss": 4.0586, "loss/crossentropy": 1.9876770973205566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21775592118501663, "step": 10550 }, { "epoch": 0.21104, "grad_norm": 2.0625, "grad_norm_var": 0.06347249348958334, "learning_rate": 0.0001, "loss": 4.2372, "loss/crossentropy": 2.292428970336914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24829304218292236, "step": 10552 }, { "epoch": 0.21108, "grad_norm": 2.015625, "grad_norm_var": 0.06552632649739583, "learning_rate": 0.0001, "loss": 4.3412, "loss/crossentropy": 2.257239043712616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2195405438542366, "step": 10554 }, { "epoch": 0.21112, "grad_norm": 2.171875, "grad_norm_var": 0.06616923014322916, "learning_rate": 0.0001, "loss": 4.404, "loss/crossentropy": 2.1424754858016968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21044345945119858, "step": 10556 }, { "epoch": 0.21116, "grad_norm": 2.15625, "grad_norm_var": 0.06642964680989584, "learning_rate": 0.0001, "loss": 4.2812, "loss/crossentropy": 1.777747094631195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19684316962957382, "step": 10558 }, { "epoch": 0.2112, "grad_norm": 2.015625, "grad_norm_var": 0.06655171712239584, "learning_rate": 0.0001, "loss": 4.1239, "loss/crossentropy": 2.0860772728919983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20560266077518463, "step": 10560 }, { "epoch": 0.21124, "grad_norm": 1.96875, "grad_norm_var": 0.06787007649739583, "learning_rate": 0.0001, "loss": 4.2409, "loss/crossentropy": 1.9594369530677795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015877440571785, "step": 10562 }, { "epoch": 0.21128, "grad_norm": 2.078125, "grad_norm_var": 0.06467692057291667, "learning_rate": 0.0001, "loss": 4.2539, "loss/crossentropy": 2.069046676158905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21409741044044495, "step": 10564 }, { "epoch": 0.21132, "grad_norm": 1.921875, "grad_norm_var": 0.009464263916015625, "learning_rate": 0.0001, "loss": 3.822, "loss/crossentropy": 1.7300589084625244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18024658411741257, "step": 10566 }, { "epoch": 0.21136, "grad_norm": 2.125, "grad_norm_var": 0.0077288309733072914, "learning_rate": 0.0001, "loss": 4.2608, "loss/crossentropy": 2.048615336418152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.228649340569973, "step": 10568 }, { "epoch": 0.2114, "grad_norm": 2.171875, "grad_norm_var": 0.007085927327473958, "learning_rate": 0.0001, "loss": 4.4622, "loss/crossentropy": 1.9678268432617188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20117972791194916, "step": 10570 }, { "epoch": 0.21144, "grad_norm": 2.1875, "grad_norm_var": 0.006534576416015625, "learning_rate": 0.0001, "loss": 4.3919, "loss/crossentropy": 2.595113754272461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24565115571022034, "step": 10572 }, { "epoch": 0.21148, "grad_norm": 2.25, "grad_norm_var": 0.008658599853515626, "learning_rate": 0.0001, "loss": 4.0873, "loss/crossentropy": 2.096100628376007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20913005620241165, "step": 10574 }, { "epoch": 0.21152, "grad_norm": 2.25, "grad_norm_var": 0.012223052978515624, "learning_rate": 0.0001, "loss": 4.181, "loss/crossentropy": 2.0096259713172913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21125900745391846, "step": 10576 }, { "epoch": 0.21156, "grad_norm": 2.046875, "grad_norm_var": 0.011572011311848958, "learning_rate": 0.0001, "loss": 3.9738, "loss/crossentropy": 1.9990533590316772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20894166082143784, "step": 10578 }, { "epoch": 0.2116, "grad_norm": 2.0625, "grad_norm_var": 0.013606516520182292, "learning_rate": 0.0001, "loss": 4.0404, "loss/crossentropy": 2.1385116577148438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19753456860780716, "step": 10580 }, { "epoch": 0.21164, "grad_norm": 2.328125, "grad_norm_var": 0.012482706705729167, "learning_rate": 0.0001, "loss": 4.5056, "loss/crossentropy": 2.2081239819526672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23155531287193298, "step": 10582 }, { "epoch": 0.21168, "grad_norm": 2.125, "grad_norm_var": 0.014339192708333334, "learning_rate": 0.0001, "loss": 4.161, "loss/crossentropy": 1.6915860772132874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18060292303562164, "step": 10584 }, { "epoch": 0.21172, "grad_norm": 1.875, "grad_norm_var": 0.019245402018229166, "learning_rate": 0.0001, "loss": 4.083, "loss/crossentropy": 1.5825872421264648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19387374818325043, "step": 10586 }, { "epoch": 0.21176, "grad_norm": 2.109375, "grad_norm_var": 0.0253082275390625, "learning_rate": 0.0001, "loss": 4.6102, "loss/crossentropy": 2.386876940727234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23594971746206284, "step": 10588 }, { "epoch": 0.2118, "grad_norm": 1.8828125, "grad_norm_var": 0.02759577433268229, "learning_rate": 0.0001, "loss": 4.106, "loss/crossentropy": 2.202653169631958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22666631639003754, "step": 10590 }, { "epoch": 0.21184, "grad_norm": 2.046875, "grad_norm_var": 0.026041412353515626, "learning_rate": 0.0001, "loss": 4.2285, "loss/crossentropy": 2.013135075569153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22155165672302246, "step": 10592 }, { "epoch": 0.21188, "grad_norm": 2.25, "grad_norm_var": 2.762861887613932, "learning_rate": 0.0001, "loss": 4.1353, "loss/crossentropy": 1.6701499223709106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21808167546987534, "step": 10594 }, { "epoch": 0.21192, "grad_norm": 2.015625, "grad_norm_var": 2.7677996317545572, "learning_rate": 0.0001, "loss": 4.1438, "loss/crossentropy": 2.074462592601776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22254129499197006, "step": 10596 }, { "epoch": 0.21196, "grad_norm": 2.203125, "grad_norm_var": 2.787275950113932, "learning_rate": 0.0001, "loss": 4.2329, "loss/crossentropy": 2.182308316230774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21201200038194656, "step": 10598 }, { "epoch": 0.212, "grad_norm": 2.0625, "grad_norm_var": 2.784148915608724, "learning_rate": 0.0001, "loss": 4.0688, "loss/crossentropy": 1.9777602553367615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21454624831676483, "step": 10600 }, { "epoch": 0.21204, "grad_norm": 2.03125, "grad_norm_var": 2.778930409749349, "learning_rate": 0.0001, "loss": 4.2707, "loss/crossentropy": 2.272592306137085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22290532290935516, "step": 10602 }, { "epoch": 0.21208, "grad_norm": 2.078125, "grad_norm_var": 2.784010569254557, "learning_rate": 0.0001, "loss": 4.3406, "loss/crossentropy": 2.110643744468689, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22552277147769928, "step": 10604 }, { "epoch": 0.21212, "grad_norm": 2.125, "grad_norm_var": 2.7753326416015627, "learning_rate": 0.0001, "loss": 4.3485, "loss/crossentropy": 2.279823422431946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22356732934713364, "step": 10606 }, { "epoch": 0.21216, "grad_norm": 2.109375, "grad_norm_var": 2.768701171875, "learning_rate": 0.0001, "loss": 4.3203, "loss/crossentropy": 2.0985517501831055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22894078493118286, "step": 10608 }, { "epoch": 0.2122, "grad_norm": 2.046875, "grad_norm_var": 0.006884765625, "learning_rate": 0.0001, "loss": 4.2295, "loss/crossentropy": 2.251029133796692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24473578482866287, "step": 10610 }, { "epoch": 0.21224, "grad_norm": 2.140625, "grad_norm_var": 0.005785115559895833, "learning_rate": 0.0001, "loss": 4.2451, "loss/crossentropy": 2.1706892251968384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21725767105817795, "step": 10612 }, { "epoch": 0.21228, "grad_norm": 2.046875, "grad_norm_var": 0.004378255208333333, "learning_rate": 0.0001, "loss": 4.3319, "loss/crossentropy": 2.0709590315818787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22771050035953522, "step": 10614 }, { "epoch": 0.21232, "grad_norm": 1.9609375, "grad_norm_var": 0.005163319905598958, "learning_rate": 0.0001, "loss": 4.2507, "loss/crossentropy": 2.0437510013580322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055979147553444, "step": 10616 }, { "epoch": 0.21236, "grad_norm": 2.140625, "grad_norm_var": 0.005204010009765625, "learning_rate": 0.0001, "loss": 4.2492, "loss/crossentropy": 2.023369252681732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23950359225273132, "step": 10618 }, { "epoch": 0.2124, "grad_norm": 2.140625, "grad_norm_var": 0.004325103759765625, "learning_rate": 0.0001, "loss": 4.3522, "loss/crossentropy": 2.051850199699402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24145089089870453, "step": 10620 }, { "epoch": 0.21244, "grad_norm": 2.015625, "grad_norm_var": 0.0044247945149739586, "learning_rate": 0.0001, "loss": 4.1743, "loss/crossentropy": 1.9617546796798706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2095404863357544, "step": 10622 }, { "epoch": 0.21248, "grad_norm": 2.140625, "grad_norm_var": 0.004662831624348958, "learning_rate": 0.0001, "loss": 4.2115, "loss/crossentropy": 1.9792284965515137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21156969666481018, "step": 10624 }, { "epoch": 0.21252, "grad_norm": 2.046875, "grad_norm_var": 0.004662831624348958, "learning_rate": 0.0001, "loss": 4.4402, "loss/crossentropy": 1.936375379562378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21360263973474503, "step": 10626 }, { "epoch": 0.21256, "grad_norm": 1.859375, "grad_norm_var": 0.0071408589680989586, "learning_rate": 0.0001, "loss": 4.0695, "loss/crossentropy": 2.11286723613739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20090097934007645, "step": 10628 }, { "epoch": 0.2126, "grad_norm": 2.015625, "grad_norm_var": 0.007120513916015625, "learning_rate": 0.0001, "loss": 4.2555, "loss/crossentropy": 2.1020379066467285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21139420568943024, "step": 10630 }, { "epoch": 0.21264, "grad_norm": 2.0625, "grad_norm_var": 0.0064453125, "learning_rate": 0.0001, "loss": 4.2027, "loss/crossentropy": 2.220509111881256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22286564111709595, "step": 10632 }, { "epoch": 0.21268, "grad_norm": 1.890625, "grad_norm_var": 0.0080078125, "learning_rate": 0.0001, "loss": 4.0012, "loss/crossentropy": 2.058075189590454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22242067009210587, "step": 10634 }, { "epoch": 0.21272, "grad_norm": 2.09375, "grad_norm_var": 0.005269368489583333, "learning_rate": 0.0001, "loss": 4.1825, "loss/crossentropy": 1.9641217589378357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20702090859413147, "step": 10636 }, { "epoch": 0.21276, "grad_norm": 2.046875, "grad_norm_var": 0.005231730143229167, "learning_rate": 0.0001, "loss": 4.1309, "loss/crossentropy": 1.9234120845794678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20697829127311707, "step": 10638 }, { "epoch": 0.2128, "grad_norm": 1.9375, "grad_norm_var": 0.0067047119140625, "learning_rate": 0.0001, "loss": 4.2551, "loss/crossentropy": 2.1050453782081604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21321691572666168, "step": 10640 }, { "epoch": 0.21284, "grad_norm": 1.8515625, "grad_norm_var": 0.009323883056640624, "learning_rate": 0.0001, "loss": 4.0852, "loss/crossentropy": 2.21867573261261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21468330919742584, "step": 10642 }, { "epoch": 0.21288, "grad_norm": 2.09375, "grad_norm_var": 0.007319895426432291, "learning_rate": 0.0001, "loss": 4.4209, "loss/crossentropy": 2.213089942932129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21172764152288437, "step": 10644 }, { "epoch": 0.21292, "grad_norm": 2.109375, "grad_norm_var": 0.012556711832682291, "learning_rate": 0.0001, "loss": 4.3261, "loss/crossentropy": 2.0370752811431885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23427169024944305, "step": 10646 }, { "epoch": 0.21296, "grad_norm": 2.109375, "grad_norm_var": 0.016035715738932293, "learning_rate": 0.0001, "loss": 4.2622, "loss/crossentropy": 2.080373227596283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148713618516922, "step": 10648 }, { "epoch": 0.213, "grad_norm": 2.078125, "grad_norm_var": 0.014134724934895834, "learning_rate": 0.0001, "loss": 4.5408, "loss/crossentropy": 2.3023130893707275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22768481075763702, "step": 10650 }, { "epoch": 0.21304, "grad_norm": 2.203125, "grad_norm_var": 0.016185506184895834, "learning_rate": 0.0001, "loss": 4.372, "loss/crossentropy": 2.14642870426178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22463608533143997, "step": 10652 }, { "epoch": 0.21308, "grad_norm": 2.03125, "grad_norm_var": 0.016377766927083332, "learning_rate": 0.0001, "loss": 4.2164, "loss/crossentropy": 2.2469639778137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505262568593025, "step": 10654 }, { "epoch": 0.21312, "grad_norm": 2.0625, "grad_norm_var": 0.015915679931640624, "learning_rate": 0.0001, "loss": 4.243, "loss/crossentropy": 2.0431448221206665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21013544499874115, "step": 10656 }, { "epoch": 0.21316, "grad_norm": 1.9375, "grad_norm_var": 0.013631184895833334, "learning_rate": 0.0001, "loss": 3.9872, "loss/crossentropy": 1.6768526434898376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18652133643627167, "step": 10658 }, { "epoch": 0.2132, "grad_norm": 2.0, "grad_norm_var": 0.017175038655598957, "learning_rate": 0.0001, "loss": 3.9442, "loss/crossentropy": 1.3748261332511902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15284860879182816, "step": 10660 }, { "epoch": 0.21324, "grad_norm": 2.03125, "grad_norm_var": 0.012282053629557291, "learning_rate": 0.0001, "loss": 4.1451, "loss/crossentropy": 1.9126732349395752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21141140908002853, "step": 10662 }, { "epoch": 0.21328, "grad_norm": 2.1875, "grad_norm_var": 0.010908762613932291, "learning_rate": 0.0001, "loss": 4.2392, "loss/crossentropy": 1.97357976436615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21326088905334473, "step": 10664 }, { "epoch": 0.21332, "grad_norm": 2.109375, "grad_norm_var": 0.011201985677083333, "learning_rate": 0.0001, "loss": 4.2825, "loss/crossentropy": 2.1782814860343933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21183249354362488, "step": 10666 }, { "epoch": 0.21336, "grad_norm": 1.9609375, "grad_norm_var": 0.009098307291666666, "learning_rate": 0.0001, "loss": 4.1251, "loss/crossentropy": 2.1700649857521057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2310282066464424, "step": 10668 }, { "epoch": 0.2134, "grad_norm": 2.109375, "grad_norm_var": 0.0087554931640625, "learning_rate": 0.0001, "loss": 4.479, "loss/crossentropy": 2.249666213989258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21025578677654266, "step": 10670 }, { "epoch": 0.21344, "grad_norm": 2.15625, "grad_norm_var": 0.009287261962890625, "learning_rate": 0.0001, "loss": 4.3445, "loss/crossentropy": 1.993752121925354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20829375833272934, "step": 10672 }, { "epoch": 0.21348, "grad_norm": 2.046875, "grad_norm_var": 0.009244791666666667, "learning_rate": 0.0001, "loss": 4.0814, "loss/crossentropy": 2.0472013354301453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21456149220466614, "step": 10674 }, { "epoch": 0.21352, "grad_norm": 2.109375, "grad_norm_var": 0.007222239176432292, "learning_rate": 0.0001, "loss": 3.9497, "loss/crossentropy": 2.0347819328308105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2133965790271759, "step": 10676 }, { "epoch": 0.21356, "grad_norm": 1.984375, "grad_norm_var": 0.008168284098307292, "learning_rate": 0.0001, "loss": 4.1528, "loss/crossentropy": 1.86410254240036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22139018028974533, "step": 10678 }, { "epoch": 0.2136, "grad_norm": 2.015625, "grad_norm_var": 0.005641428629557291, "learning_rate": 0.0001, "loss": 4.3134, "loss/crossentropy": 2.3244482278823853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24854417145252228, "step": 10680 }, { "epoch": 0.21364, "grad_norm": 2.0, "grad_norm_var": 0.006304677327473958, "learning_rate": 0.0001, "loss": 4.7182, "loss/crossentropy": 2.518718123435974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.274397537112236, "step": 10682 }, { "epoch": 0.21368, "grad_norm": 2.09375, "grad_norm_var": 0.005928548177083334, "learning_rate": 0.0001, "loss": 4.1289, "loss/crossentropy": 1.8111079931259155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039540931582451, "step": 10684 }, { "epoch": 0.21372, "grad_norm": 1.8828125, "grad_norm_var": 0.008129628499348958, "learning_rate": 0.0001, "loss": 4.3446, "loss/crossentropy": 2.186485230922699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22497782111167908, "step": 10686 }, { "epoch": 0.21376, "grad_norm": 2.21875, "grad_norm_var": 0.009877268473307292, "learning_rate": 0.0001, "loss": 4.4708, "loss/crossentropy": 2.1850993633270264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22541093826293945, "step": 10688 }, { "epoch": 0.2138, "grad_norm": 2.078125, "grad_norm_var": 0.0090576171875, "learning_rate": 0.0001, "loss": 4.3991, "loss/crossentropy": 1.9756001830101013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23400776088237762, "step": 10690 }, { "epoch": 0.21384, "grad_norm": 2.125, "grad_norm_var": 0.010652669270833333, "learning_rate": 0.0001, "loss": 4.0283, "loss/crossentropy": 1.7454423904418945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18263398110866547, "step": 10692 }, { "epoch": 0.21388, "grad_norm": 2.015625, "grad_norm_var": 0.010066731770833334, "learning_rate": 0.0001, "loss": 3.9364, "loss/crossentropy": 1.5824024081230164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18881092965602875, "step": 10694 }, { "epoch": 0.21392, "grad_norm": 2.15625, "grad_norm_var": 0.009383138020833333, "learning_rate": 0.0001, "loss": 4.3587, "loss/crossentropy": 2.1171644926071167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2173115462064743, "step": 10696 }, { "epoch": 0.21396, "grad_norm": 2.0625, "grad_norm_var": 0.008698527018229167, "learning_rate": 0.0001, "loss": 4.2102, "loss/crossentropy": 1.9327389001846313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22025491297245026, "step": 10698 }, { "epoch": 0.214, "grad_norm": 1.90625, "grad_norm_var": 0.009952799479166666, "learning_rate": 0.0001, "loss": 3.9622, "loss/crossentropy": 1.9806901216506958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2089216560125351, "step": 10700 }, { "epoch": 0.21404, "grad_norm": 2.109375, "grad_norm_var": 0.007503000895182291, "learning_rate": 0.0001, "loss": 4.2503, "loss/crossentropy": 2.216805338859558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21268506348133087, "step": 10702 }, { "epoch": 0.21408, "grad_norm": 1.9765625, "grad_norm_var": 0.0055084228515625, "learning_rate": 0.0001, "loss": 4.5121, "loss/crossentropy": 2.3998383283615112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26092807948589325, "step": 10704 }, { "epoch": 0.21412, "grad_norm": 2.171875, "grad_norm_var": 0.007869466145833334, "learning_rate": 0.0001, "loss": 4.4273, "loss/crossentropy": 2.0581844449043274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2316882163286209, "step": 10706 }, { "epoch": 0.21416, "grad_norm": 2.09375, "grad_norm_var": 0.006258138020833333, "learning_rate": 0.0001, "loss": 4.2684, "loss/crossentropy": 2.3091371059417725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23538943380117416, "step": 10708 }, { "epoch": 0.2142, "grad_norm": 1.9921875, "grad_norm_var": 0.0067291259765625, "learning_rate": 0.0001, "loss": 3.9567, "loss/crossentropy": 1.7134324312210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1941806674003601, "step": 10710 }, { "epoch": 0.21424, "grad_norm": 2.078125, "grad_norm_var": 0.0060699462890625, "learning_rate": 0.0001, "loss": 4.3165, "loss/crossentropy": 2.2040648460388184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20669078081846237, "step": 10712 }, { "epoch": 0.21428, "grad_norm": 2.25, "grad_norm_var": 0.020612589518229165, "learning_rate": 0.0001, "loss": 4.3207, "loss/crossentropy": 2.3040376901626587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20685256272554398, "step": 10714 }, { "epoch": 0.21432, "grad_norm": 2.328125, "grad_norm_var": 0.020318349202473957, "learning_rate": 0.0001, "loss": 4.2674, "loss/crossentropy": 2.189309239387512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21499691903591156, "step": 10716 }, { "epoch": 0.21436, "grad_norm": 2.1875, "grad_norm_var": 0.020216623942057293, "learning_rate": 0.0001, "loss": 4.4144, "loss/crossentropy": 2.1955957412719727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20932748913764954, "step": 10718 }, { "epoch": 0.2144, "grad_norm": 2.140625, "grad_norm_var": 0.020271809895833333, "learning_rate": 0.0001, "loss": 4.2135, "loss/crossentropy": 1.7800896763801575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1939619928598404, "step": 10720 }, { "epoch": 0.21444, "grad_norm": 2.15625, "grad_norm_var": 0.020335896809895834, "learning_rate": 0.0001, "loss": 4.2757, "loss/crossentropy": 1.8974847197532654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19434937089681625, "step": 10722 }, { "epoch": 0.21448, "grad_norm": 2.171875, "grad_norm_var": 0.021455891927083335, "learning_rate": 0.0001, "loss": 4.2688, "loss/crossentropy": 2.1631242632865906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21765826642513275, "step": 10724 }, { "epoch": 0.21452, "grad_norm": 2.03125, "grad_norm_var": 0.0203521728515625, "learning_rate": 0.0001, "loss": 4.6421, "loss/crossentropy": 2.241260290145874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2334459200501442, "step": 10726 }, { "epoch": 0.21456, "grad_norm": 2.0625, "grad_norm_var": 0.0226470947265625, "learning_rate": 0.0001, "loss": 3.9478, "loss/crossentropy": 1.9063156247138977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22118454426527023, "step": 10728 }, { "epoch": 0.2146, "grad_norm": 2.03125, "grad_norm_var": 0.011310831705729166, "learning_rate": 0.0001, "loss": 4.0446, "loss/crossentropy": 1.8109349012374878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18581955134868622, "step": 10730 }, { "epoch": 0.21464, "grad_norm": 2.265625, "grad_norm_var": 0.009749348958333333, "learning_rate": 0.0001, "loss": 4.3262, "loss/crossentropy": 2.1113163232803345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21339301764965057, "step": 10732 }, { "epoch": 0.21468, "grad_norm": 2.0625, "grad_norm_var": 0.0108551025390625, "learning_rate": 0.0001, "loss": 4.5308, "loss/crossentropy": 2.1731717586517334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22267260402441025, "step": 10734 }, { "epoch": 0.21472, "grad_norm": 2.078125, "grad_norm_var": 0.010087076822916667, "learning_rate": 0.0001, "loss": 4.2363, "loss/crossentropy": 2.322808027267456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21375543624162674, "step": 10736 }, { "epoch": 0.21476, "grad_norm": 2.0, "grad_norm_var": 0.010358683268229167, "learning_rate": 0.0001, "loss": 4.109, "loss/crossentropy": 1.8200489282608032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19796901941299438, "step": 10738 }, { "epoch": 0.2148, "grad_norm": 2.0625, "grad_norm_var": 0.009455362955729166, "learning_rate": 0.0001, "loss": 4.0918, "loss/crossentropy": 2.353461265563965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22755976021289825, "step": 10740 }, { "epoch": 0.21484, "grad_norm": 2.015625, "grad_norm_var": 0.0071451822916666664, "learning_rate": 0.0001, "loss": 4.2925, "loss/crossentropy": 2.3200663328170776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23450587689876556, "step": 10742 }, { "epoch": 0.21488, "grad_norm": 2.0, "grad_norm_var": 0.006403605143229167, "learning_rate": 0.0001, "loss": 4.187, "loss/crossentropy": 1.7293912768363953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17938270419836044, "step": 10744 }, { "epoch": 0.21492, "grad_norm": 1.8828125, "grad_norm_var": 0.008656565348307292, "learning_rate": 0.0001, "loss": 4.3213, "loss/crossentropy": 1.9759944081306458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20631389319896698, "step": 10746 }, { "epoch": 0.21496, "grad_norm": 2.1875, "grad_norm_var": 0.006870269775390625, "learning_rate": 0.0001, "loss": 4.3658, "loss/crossentropy": 2.2232764959335327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21919700503349304, "step": 10748 }, { "epoch": 0.215, "grad_norm": 1.953125, "grad_norm_var": 0.004898834228515625, "learning_rate": 0.0001, "loss": 4.2466, "loss/crossentropy": 2.0827722549438477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20569747686386108, "step": 10750 }, { "epoch": 0.21504, "grad_norm": 1.9765625, "grad_norm_var": 0.005509440104166667, "learning_rate": 0.0001, "loss": 4.2373, "loss/crossentropy": 2.0712032318115234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20504066348075867, "step": 10752 }, { "epoch": 0.21508, "grad_norm": 2.234375, "grad_norm_var": 0.007963053385416667, "learning_rate": 0.0001, "loss": 4.3466, "loss/crossentropy": 2.2717082500457764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22957370430231094, "step": 10754 }, { "epoch": 0.21512, "grad_norm": 2.359375, "grad_norm_var": 0.0133056640625, "learning_rate": 0.0001, "loss": 4.2821, "loss/crossentropy": 2.263810157775879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2291572391986847, "step": 10756 }, { "epoch": 0.21516, "grad_norm": 1.9921875, "grad_norm_var": 0.013724517822265626, "learning_rate": 0.0001, "loss": 4.2157, "loss/crossentropy": 2.103231191635132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21839886158704758, "step": 10758 }, { "epoch": 0.2152, "grad_norm": 2.203125, "grad_norm_var": 0.013962554931640624, "learning_rate": 0.0001, "loss": 4.4772, "loss/crossentropy": 2.1272148489952087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280830293893814, "step": 10760 }, { "epoch": 0.21524, "grad_norm": 2.078125, "grad_norm_var": 0.012962849934895833, "learning_rate": 0.0001, "loss": 4.1192, "loss/crossentropy": 2.133938789367676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2162996605038643, "step": 10762 }, { "epoch": 0.21528, "grad_norm": 2.828125, "grad_norm_var": 0.04528401692708333, "learning_rate": 0.0001, "loss": 4.5351, "loss/crossentropy": 2.305683732032776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23797836154699326, "step": 10764 }, { "epoch": 0.21532, "grad_norm": 2.078125, "grad_norm_var": 0.04370829264322917, "learning_rate": 0.0001, "loss": 3.9886, "loss/crossentropy": 1.7248413562774658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17226764559745789, "step": 10766 }, { "epoch": 0.21536, "grad_norm": 2.1875, "grad_norm_var": 0.041715240478515624, "learning_rate": 0.0001, "loss": 4.6422, "loss/crossentropy": 2.275644540786743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2433001920580864, "step": 10768 }, { "epoch": 0.2154, "grad_norm": 2.1875, "grad_norm_var": 0.043338775634765625, "learning_rate": 0.0001, "loss": 4.2836, "loss/crossentropy": 1.908457100391388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19758973270654678, "step": 10770 }, { "epoch": 0.21544, "grad_norm": 1.8984375, "grad_norm_var": 0.07423909505208333, "learning_rate": 0.0001, "loss": 4.2317, "loss/crossentropy": 1.9401238560676575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22177566587924957, "step": 10772 }, { "epoch": 0.21548, "grad_norm": 2.046875, "grad_norm_var": 0.07476170857747395, "learning_rate": 0.0001, "loss": 4.295, "loss/crossentropy": 2.247257351875305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21560464799404144, "step": 10774 }, { "epoch": 0.21552, "grad_norm": 2.015625, "grad_norm_var": 0.07644424438476563, "learning_rate": 0.0001, "loss": 4.1482, "loss/crossentropy": 1.9191248416900635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214834064245224, "step": 10776 }, { "epoch": 0.21556, "grad_norm": 2.0625, "grad_norm_var": 0.0785888671875, "learning_rate": 0.0001, "loss": 4.0644, "loss/crossentropy": 1.8963102102279663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20552606880664825, "step": 10778 }, { "epoch": 0.2156, "grad_norm": 2.015625, "grad_norm_var": 0.047078450520833336, "learning_rate": 0.0001, "loss": 4.265, "loss/crossentropy": 2.0517951250076294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963544860482216, "step": 10780 }, { "epoch": 0.21564, "grad_norm": 2.171875, "grad_norm_var": 0.046727498372395836, "learning_rate": 0.0001, "loss": 4.3571, "loss/crossentropy": 2.1484888792037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23963741213083267, "step": 10782 }, { "epoch": 0.21568, "grad_norm": 2.671875, "grad_norm_var": 0.06716206868489584, "learning_rate": 0.0001, "loss": 4.6827, "loss/crossentropy": 2.012593388557434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115195021033287, "step": 10784 }, { "epoch": 0.21572, "grad_norm": 2.03125, "grad_norm_var": 0.06651102701822917, "learning_rate": 0.0001, "loss": 4.018, "loss/crossentropy": 2.139856696128845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23609659075737, "step": 10786 }, { "epoch": 0.21576, "grad_norm": 2.25, "grad_norm_var": 0.03216120402018229, "learning_rate": 0.0001, "loss": 4.5007, "loss/crossentropy": 2.0139951705932617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22799161076545715, "step": 10788 }, { "epoch": 0.2158, "grad_norm": 2.125, "grad_norm_var": 0.035676829020182294, "learning_rate": 0.0001, "loss": 4.227, "loss/crossentropy": 1.8325074315071106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18156912177801132, "step": 10790 }, { "epoch": 0.21584, "grad_norm": 2.109375, "grad_norm_var": 0.04155654907226562, "learning_rate": 0.0001, "loss": 3.8042, "loss/crossentropy": 1.7531892657279968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18498936295509338, "step": 10792 }, { "epoch": 0.21588, "grad_norm": 2.25, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 4.3955, "loss/crossentropy": 2.09742671251297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21543975174427032, "step": 10794 }, { "epoch": 0.21592, "grad_norm": 2.09375, "grad_norm_var": 0.03532613118489583, "learning_rate": 0.0001, "loss": 4.4752, "loss/crossentropy": 2.4303117990493774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24365779757499695, "step": 10796 }, { "epoch": 0.21596, "grad_norm": 2.1875, "grad_norm_var": 0.0371002197265625, "learning_rate": 0.0001, "loss": 4.1973, "loss/crossentropy": 2.007950007915497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20470578223466873, "step": 10798 }, { "epoch": 0.216, "grad_norm": 2.5, "grad_norm_var": 0.026949055989583335, "learning_rate": 0.0001, "loss": 3.9975, "loss/crossentropy": 1.895507276058197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005787119269371, "step": 10800 }, { "epoch": 0.21604, "grad_norm": 2.25, "grad_norm_var": 0.026691691080729166, "learning_rate": 0.0001, "loss": 4.2543, "loss/crossentropy": 2.174374043941498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2282470539212227, "step": 10802 }, { "epoch": 0.21608, "grad_norm": 1.9921875, "grad_norm_var": 0.02789484659830729, "learning_rate": 0.0001, "loss": 4.0642, "loss/crossentropy": 1.8878389596939087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19730794429779053, "step": 10804 }, { "epoch": 0.21612, "grad_norm": 2.203125, "grad_norm_var": 0.023884073893229166, "learning_rate": 0.0001, "loss": 4.1733, "loss/crossentropy": 1.9874022006988525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21521702408790588, "step": 10806 }, { "epoch": 0.21616, "grad_norm": 1.9921875, "grad_norm_var": 0.01962458292643229, "learning_rate": 0.0001, "loss": 4.0783, "loss/crossentropy": 2.0173474550247192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21552105993032455, "step": 10808 }, { "epoch": 0.2162, "grad_norm": 2.140625, "grad_norm_var": 0.018344879150390625, "learning_rate": 0.0001, "loss": 4.0583, "loss/crossentropy": 2.175139367580414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20279064774513245, "step": 10810 }, { "epoch": 0.21624, "grad_norm": 2.390625, "grad_norm_var": 0.022849273681640626, "learning_rate": 0.0001, "loss": 4.5125, "loss/crossentropy": 2.6407864093780518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2939087897539139, "step": 10812 }, { "epoch": 0.21628, "grad_norm": 2.09375, "grad_norm_var": 0.024930826822916665, "learning_rate": 0.0001, "loss": 4.0869, "loss/crossentropy": 1.947974681854248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102196291089058, "step": 10814 }, { "epoch": 0.21632, "grad_norm": 9.875, "grad_norm_var": 3.7874745686848956, "learning_rate": 0.0001, "loss": 4.3404, "loss/crossentropy": 1.9040276408195496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20579400658607483, "step": 10816 }, { "epoch": 0.21636, "grad_norm": 2.21875, "grad_norm_var": 3.7853190104166665, "learning_rate": 0.0001, "loss": 3.7119, "loss/crossentropy": 1.7941421270370483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19792088121175766, "step": 10818 }, { "epoch": 0.2164, "grad_norm": 1.953125, "grad_norm_var": 3.7839088439941406, "learning_rate": 0.0001, "loss": 4.2573, "loss/crossentropy": 2.2164441347122192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2903618812561035, "step": 10820 }, { "epoch": 0.21644, "grad_norm": 2.109375, "grad_norm_var": 3.77445068359375, "learning_rate": 0.0001, "loss": 4.2537, "loss/crossentropy": 1.9888933897018433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20800678431987762, "step": 10822 }, { "epoch": 0.21648, "grad_norm": 1.875, "grad_norm_var": 3.79569091796875, "learning_rate": 0.0001, "loss": 3.9218, "loss/crossentropy": 1.9026559591293335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18179579824209213, "step": 10824 }, { "epoch": 0.21652, "grad_norm": 2.078125, "grad_norm_var": 3.7974202473958334, "learning_rate": 0.0001, "loss": 4.1267, "loss/crossentropy": 1.9020920991897583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20755057036876678, "step": 10826 }, { "epoch": 0.21656, "grad_norm": 2.03125, "grad_norm_var": 3.8177286783854165, "learning_rate": 0.0001, "loss": 4.4689, "loss/crossentropy": 2.3464537858963013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23209182918071747, "step": 10828 }, { "epoch": 0.2166, "grad_norm": 2.3125, "grad_norm_var": 3.7860877990722654, "learning_rate": 0.0001, "loss": 4.4486, "loss/crossentropy": 2.305312991142273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2548810988664627, "step": 10830 }, { "epoch": 0.21664, "grad_norm": 2.515625, "grad_norm_var": 0.026775868733723958, "learning_rate": 0.0001, "loss": 3.9818, "loss/crossentropy": 1.4548576474189758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1718049794435501, "step": 10832 }, { "epoch": 0.21668, "grad_norm": 2.1875, "grad_norm_var": 0.02504247029622396, "learning_rate": 0.0001, "loss": 4.4909, "loss/crossentropy": 2.2954181432724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22977370768785477, "step": 10834 }, { "epoch": 0.21672, "grad_norm": 2.09375, "grad_norm_var": 0.022930653889973958, "learning_rate": 0.0001, "loss": 4.0433, "loss/crossentropy": 2.1974023580551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474600583314896, "step": 10836 }, { "epoch": 0.21676, "grad_norm": 2.109375, "grad_norm_var": 0.02304865519205729, "learning_rate": 0.0001, "loss": 4.5905, "loss/crossentropy": 2.1211158633232117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19471213221549988, "step": 10838 }, { "epoch": 0.2168, "grad_norm": 1.984375, "grad_norm_var": 0.0204010009765625, "learning_rate": 0.0001, "loss": 4.0868, "loss/crossentropy": 1.9942908883094788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1899253949522972, "step": 10840 }, { "epoch": 0.21684, "grad_norm": 1.9296875, "grad_norm_var": 0.023656209309895832, "learning_rate": 0.0001, "loss": 4.0787, "loss/crossentropy": 1.9848283529281616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777423620223999, "step": 10842 }, { "epoch": 0.21688, "grad_norm": 1.9296875, "grad_norm_var": 0.02545140584309896, "learning_rate": 0.0001, "loss": 4.1365, "loss/crossentropy": 1.84994775056839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19193057715892792, "step": 10844 }, { "epoch": 0.21692, "grad_norm": 2.09375, "grad_norm_var": 0.02163670857747396, "learning_rate": 0.0001, "loss": 4.2869, "loss/crossentropy": 1.8300130367279053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19747256487607956, "step": 10846 }, { "epoch": 0.21696, "grad_norm": 2.234375, "grad_norm_var": 0.010676829020182292, "learning_rate": 0.0001, "loss": 4.1624, "loss/crossentropy": 1.9316250681877136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22893256694078445, "step": 10848 }, { "epoch": 0.217, "grad_norm": 2.109375, "grad_norm_var": 0.009085845947265626, "learning_rate": 0.0001, "loss": 4.0294, "loss/crossentropy": 1.9862067103385925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22470611333847046, "step": 10850 }, { "epoch": 0.21704, "grad_norm": 2.140625, "grad_norm_var": 0.009372711181640625, "learning_rate": 0.0001, "loss": 4.3979, "loss/crossentropy": 2.196234107017517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22592387348413467, "step": 10852 }, { "epoch": 0.21708, "grad_norm": 2.328125, "grad_norm_var": 0.2918291727701823, "learning_rate": 0.0001, "loss": 4.5516, "loss/crossentropy": 2.3097496032714844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29694322496652603, "step": 10854 }, { "epoch": 0.21712, "grad_norm": 1.984375, "grad_norm_var": 0.2865191141764323, "learning_rate": 0.0001, "loss": 3.8687, "loss/crossentropy": 1.883777916431427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21657781302928925, "step": 10856 }, { "epoch": 0.21716, "grad_norm": 2.25, "grad_norm_var": 0.2916338602701823, "learning_rate": 0.0001, "loss": 4.1915, "loss/crossentropy": 2.0483964681625366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22432015091180801, "step": 10858 }, { "epoch": 0.2172, "grad_norm": 2.140625, "grad_norm_var": 0.2870839436848958, "learning_rate": 0.0001, "loss": 4.1503, "loss/crossentropy": 2.2700769901275635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21760191768407822, "step": 10860 }, { "epoch": 0.21724, "grad_norm": 2.03125, "grad_norm_var": 0.2878214518229167, "learning_rate": 0.0001, "loss": 4.574, "loss/crossentropy": 2.3758704662323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21792180836200714, "step": 10862 }, { "epoch": 0.21728, "grad_norm": 2.03125, "grad_norm_var": 0.2934641520182292, "learning_rate": 0.0001, "loss": 4.2895, "loss/crossentropy": 2.1972473859786987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22460343688726425, "step": 10864 }, { "epoch": 0.21732, "grad_norm": 2.171875, "grad_norm_var": 0.29321187337239585, "learning_rate": 0.0001, "loss": 4.3693, "loss/crossentropy": 1.9811919331550598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23008134216070175, "step": 10866 }, { "epoch": 0.21736, "grad_norm": 2.03125, "grad_norm_var": 0.2992327372233073, "learning_rate": 0.0001, "loss": 4.2237, "loss/crossentropy": 2.032214403152466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958027482032776, "step": 10868 }, { "epoch": 0.2174, "grad_norm": 2.09375, "grad_norm_var": 0.035162099202473956, "learning_rate": 0.0001, "loss": 4.0956, "loss/crossentropy": 2.08488667011261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21948565542697906, "step": 10870 }, { "epoch": 0.21744, "grad_norm": 2.109375, "grad_norm_var": 0.03401260375976563, "learning_rate": 0.0001, "loss": 4.2923, "loss/crossentropy": 2.304922103881836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977907121181488, "step": 10872 }, { "epoch": 0.21748, "grad_norm": 2.03125, "grad_norm_var": 0.005582427978515625, "learning_rate": 0.0001, "loss": 4.1762, "loss/crossentropy": 1.991280436515808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2196018323302269, "step": 10874 }, { "epoch": 0.21752, "grad_norm": 2.09375, "grad_norm_var": 0.004965972900390625, "learning_rate": 0.0001, "loss": 4.4116, "loss/crossentropy": 1.955183207988739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21059879660606384, "step": 10876 }, { "epoch": 0.21756, "grad_norm": 2.140625, "grad_norm_var": 0.0045562744140625, "learning_rate": 0.0001, "loss": 4.2105, "loss/crossentropy": 2.1443604230880737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23160522431135178, "step": 10878 }, { "epoch": 0.2176, "grad_norm": 2.140625, "grad_norm_var": 0.004889933268229166, "learning_rate": 0.0001, "loss": 4.2374, "loss/crossentropy": 2.0859211683273315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2296902909874916, "step": 10880 }, { "epoch": 0.21764, "grad_norm": 2.0, "grad_norm_var": 0.0059234619140625, "learning_rate": 0.0001, "loss": 4.3303, "loss/crossentropy": 2.220987915992737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22863281518220901, "step": 10882 }, { "epoch": 0.21768, "grad_norm": 1.96875, "grad_norm_var": 0.005625152587890625, "learning_rate": 0.0001, "loss": 4.2195, "loss/crossentropy": 2.1172574758529663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197882778942585, "step": 10884 }, { "epoch": 0.21772, "grad_norm": 2.25, "grad_norm_var": 0.02047704060872396, "learning_rate": 0.0001, "loss": 4.3907, "loss/crossentropy": 1.7861940264701843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984492540359497, "step": 10886 }, { "epoch": 0.21776, "grad_norm": 1.9140625, "grad_norm_var": 0.0218414306640625, "learning_rate": 0.0001, "loss": 4.0445, "loss/crossentropy": 2.1119033098220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20427606999874115, "step": 10888 }, { "epoch": 0.2178, "grad_norm": 2.15625, "grad_norm_var": 0.02235692342122396, "learning_rate": 0.0001, "loss": 3.923, "loss/crossentropy": 1.7849717140197754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20817570388317108, "step": 10890 }, { "epoch": 0.21784, "grad_norm": 2.03125, "grad_norm_var": 0.025131988525390624, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 2.0543535351753235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20579595863819122, "step": 10892 }, { "epoch": 0.21788, "grad_norm": 1.8984375, "grad_norm_var": 0.02643000284830729, "learning_rate": 0.0001, "loss": 4.1527, "loss/crossentropy": 2.085465431213379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23541826009750366, "step": 10894 }, { "epoch": 0.21792, "grad_norm": 2.40625, "grad_norm_var": 0.03343073527018229, "learning_rate": 0.0001, "loss": 4.3288, "loss/crossentropy": 1.9521069526672363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21404746174812317, "step": 10896 }, { "epoch": 0.21796, "grad_norm": 1.96875, "grad_norm_var": 0.03172378540039063, "learning_rate": 0.0001, "loss": 4.039, "loss/crossentropy": 1.8710272908210754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20500759035348892, "step": 10898 }, { "epoch": 0.218, "grad_norm": 2.0625, "grad_norm_var": 0.032083892822265626, "learning_rate": 0.0001, "loss": 4.0695, "loss/crossentropy": 2.275243639945984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24073244631290436, "step": 10900 }, { "epoch": 0.21804, "grad_norm": 2.109375, "grad_norm_var": 0.0181060791015625, "learning_rate": 0.0001, "loss": 4.3508, "loss/crossentropy": 2.2093106508255005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23254899680614471, "step": 10902 }, { "epoch": 0.21808, "grad_norm": 2.0, "grad_norm_var": 0.01702855428059896, "learning_rate": 0.0001, "loss": 4.2805, "loss/crossentropy": 2.0315810441970825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614907145500183, "step": 10904 }, { "epoch": 0.21812, "grad_norm": 2.015625, "grad_norm_var": 0.016478474934895834, "learning_rate": 0.0001, "loss": 4.3461, "loss/crossentropy": 2.0397735834121704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093086987733841, "step": 10906 }, { "epoch": 0.21816, "grad_norm": 2.03125, "grad_norm_var": 0.013508097330729166, "learning_rate": 0.0001, "loss": 4.0286, "loss/crossentropy": 1.9616519808769226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19433944672346115, "step": 10908 }, { "epoch": 0.2182, "grad_norm": 2.25, "grad_norm_var": 0.01749445597330729, "learning_rate": 0.0001, "loss": 4.2157, "loss/crossentropy": 2.1907248497009277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23218485713005066, "step": 10910 }, { "epoch": 0.21824, "grad_norm": 2.0625, "grad_norm_var": 0.009981282552083333, "learning_rate": 0.0001, "loss": 4.163, "loss/crossentropy": 2.2785152196884155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21416624635457993, "step": 10912 }, { "epoch": 0.21828, "grad_norm": 2.03125, "grad_norm_var": 0.00955810546875, "learning_rate": 0.0001, "loss": 4.2528, "loss/crossentropy": 2.151498794555664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23632780462503433, "step": 10914 }, { "epoch": 0.21832, "grad_norm": 2.03125, "grad_norm_var": 0.008829752604166666, "learning_rate": 0.0001, "loss": 4.3937, "loss/crossentropy": 2.816411852836609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2243700549006462, "step": 10916 }, { "epoch": 0.21836, "grad_norm": 2.359375, "grad_norm_var": 0.014662424723307291, "learning_rate": 0.0001, "loss": 4.4091, "loss/crossentropy": 2.2203346490859985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22302204370498657, "step": 10918 }, { "epoch": 0.2184, "grad_norm": 2.0, "grad_norm_var": 0.014662424723307291, "learning_rate": 0.0001, "loss": 4.3534, "loss/crossentropy": 2.0416316390037537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20798437297344208, "step": 10920 }, { "epoch": 0.21844, "grad_norm": 2.078125, "grad_norm_var": 0.014426422119140626, "learning_rate": 0.0001, "loss": 4.1014, "loss/crossentropy": 2.2251007556915283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149331197142601, "step": 10922 }, { "epoch": 0.21848, "grad_norm": 2.0, "grad_norm_var": 0.014631907145182291, "learning_rate": 0.0001, "loss": 4.2346, "loss/crossentropy": 1.976640522480011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21704353392124176, "step": 10924 }, { "epoch": 0.21852, "grad_norm": 2.0625, "grad_norm_var": 0.011006418863932292, "learning_rate": 0.0001, "loss": 4.3284, "loss/crossentropy": 2.041864037513733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19621634483337402, "step": 10926 }, { "epoch": 0.21856, "grad_norm": 2.125, "grad_norm_var": 0.009696451822916667, "learning_rate": 0.0001, "loss": 4.241, "loss/crossentropy": 1.7188060879707336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992729976773262, "step": 10928 }, { "epoch": 0.2186, "grad_norm": 2.140625, "grad_norm_var": 0.010512034098307291, "learning_rate": 0.0001, "loss": 4.0333, "loss/crossentropy": 1.664458692073822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20099520683288574, "step": 10930 }, { "epoch": 0.21864, "grad_norm": 2.015625, "grad_norm_var": 0.011161041259765626, "learning_rate": 0.0001, "loss": 4.1952, "loss/crossentropy": 2.199326276779175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23000852018594742, "step": 10932 }, { "epoch": 0.21868, "grad_norm": 2.015625, "grad_norm_var": 0.005694325764973958, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.980049967765808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211422473192215, "step": 10934 }, { "epoch": 0.21872, "grad_norm": 2.09375, "grad_norm_var": 0.005411529541015625, "learning_rate": 0.0001, "loss": 4.0876, "loss/crossentropy": 1.7884072661399841, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.203270323574543, "step": 10936 }, { "epoch": 0.21876, "grad_norm": 2.0, "grad_norm_var": 0.005527496337890625, "learning_rate": 0.0001, "loss": 4.1923, "loss/crossentropy": 2.1758522987365723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21960537880659103, "step": 10938 }, { "epoch": 0.2188, "grad_norm": 2.171875, "grad_norm_var": 0.004416656494140625, "learning_rate": 0.0001, "loss": 4.1708, "loss/crossentropy": 2.255508303642273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171120047569275, "step": 10940 }, { "epoch": 0.21884, "grad_norm": 2.0, "grad_norm_var": 0.004308827718098958, "learning_rate": 0.0001, "loss": 4.1294, "loss/crossentropy": 1.9758012890815735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19945884495973587, "step": 10942 }, { "epoch": 0.21888, "grad_norm": 1.9453125, "grad_norm_var": 0.0055582682291666664, "learning_rate": 0.0001, "loss": 4.2122, "loss/crossentropy": 2.2980172634124756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20864219218492508, "step": 10944 }, { "epoch": 0.21892, "grad_norm": 2.09375, "grad_norm_var": 0.006666819254557292, "learning_rate": 0.0001, "loss": 3.919, "loss/crossentropy": 1.5307916402816772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799665167927742, "step": 10946 }, { "epoch": 0.21896, "grad_norm": 2.0625, "grad_norm_var": 0.006648508707682291, "learning_rate": 0.0001, "loss": 4.3796, "loss/crossentropy": 2.3332602977752686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22964681684970856, "step": 10948 }, { "epoch": 0.219, "grad_norm": 2.078125, "grad_norm_var": 0.007380930582682291, "learning_rate": 0.0001, "loss": 4.1207, "loss/crossentropy": 2.2509007453918457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22141733020544052, "step": 10950 }, { "epoch": 0.21904, "grad_norm": 2.1875, "grad_norm_var": 0.008949534098307291, "learning_rate": 0.0001, "loss": 4.4879, "loss/crossentropy": 2.3677018880844116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25676336884498596, "step": 10952 }, { "epoch": 0.21908, "grad_norm": 2.21875, "grad_norm_var": 0.009934234619140624, "learning_rate": 0.0001, "loss": 4.4283, "loss/crossentropy": 2.2807843685150146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21486903727054596, "step": 10954 }, { "epoch": 0.21912, "grad_norm": 2.078125, "grad_norm_var": 0.009232330322265624, "learning_rate": 0.0001, "loss": 4.3126, "loss/crossentropy": 2.3883347511291504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21362561732530594, "step": 10956 }, { "epoch": 0.21916, "grad_norm": 2.0625, "grad_norm_var": 0.009405263264973958, "learning_rate": 0.0001, "loss": 4.0692, "loss/crossentropy": 1.8598107695579529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19806977361440659, "step": 10958 }, { "epoch": 0.2192, "grad_norm": 2.140625, "grad_norm_var": 0.007738240559895833, "learning_rate": 0.0001, "loss": 4.3259, "loss/crossentropy": 2.1962249875068665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20640095323324203, "step": 10960 }, { "epoch": 0.21924, "grad_norm": 2.359375, "grad_norm_var": 0.007958984375, "learning_rate": 0.0001, "loss": 4.1774, "loss/crossentropy": 2.061887502670288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22463876008987427, "step": 10962 }, { "epoch": 0.21928, "grad_norm": 2.0625, "grad_norm_var": 0.010237375895182291, "learning_rate": 0.0001, "loss": 3.9749, "loss/crossentropy": 1.9112628102302551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19327937066555023, "step": 10964 }, { "epoch": 0.21932, "grad_norm": 2.03125, "grad_norm_var": 0.010640207926432292, "learning_rate": 0.0001, "loss": 4.3078, "loss/crossentropy": 2.1655001640319824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22729264944791794, "step": 10966 }, { "epoch": 0.21936, "grad_norm": 2.171875, "grad_norm_var": 0.010400136311848959, "learning_rate": 0.0001, "loss": 4.1641, "loss/crossentropy": 2.199634552001953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23565081506967545, "step": 10968 }, { "epoch": 0.2194, "grad_norm": 2.03125, "grad_norm_var": 0.010155995686848959, "learning_rate": 0.0001, "loss": 4.4298, "loss/crossentropy": 2.4142041206359863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23020224273204803, "step": 10970 }, { "epoch": 0.21944, "grad_norm": 2.03125, "grad_norm_var": 0.011193593343098959, "learning_rate": 0.0001, "loss": 4.3856, "loss/crossentropy": 2.2480571269989014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224863201379776, "step": 10972 }, { "epoch": 0.21948, "grad_norm": 1.9453125, "grad_norm_var": 0.012238566080729167, "learning_rate": 0.0001, "loss": 4.099, "loss/crossentropy": 2.1786953806877136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21182173490524292, "step": 10974 }, { "epoch": 0.21952, "grad_norm": 1.9765625, "grad_norm_var": 0.013337961832682292, "learning_rate": 0.0001, "loss": 4.2826, "loss/crossentropy": 2.3767744302749634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22259585559368134, "step": 10976 }, { "epoch": 0.21956, "grad_norm": 2.25, "grad_norm_var": 0.010794830322265626, "learning_rate": 0.0001, "loss": 4.405, "loss/crossentropy": 1.9838140606880188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21830464899539948, "step": 10978 }, { "epoch": 0.2196, "grad_norm": 2.046875, "grad_norm_var": 0.0092193603515625, "learning_rate": 0.0001, "loss": 4.3811, "loss/crossentropy": 2.0167009234428406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21239794790744781, "step": 10980 }, { "epoch": 0.21964, "grad_norm": 2.5, "grad_norm_var": 0.019269816080729165, "learning_rate": 0.0001, "loss": 4.3117, "loss/crossentropy": 2.001616358757019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21801268309354782, "step": 10982 }, { "epoch": 0.21968, "grad_norm": 2.1875, "grad_norm_var": 0.019432576497395833, "learning_rate": 0.0001, "loss": 4.6266, "loss/crossentropy": 2.259532332420349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20131191611289978, "step": 10984 }, { "epoch": 0.21972, "grad_norm": 2.09375, "grad_norm_var": 0.0215728759765625, "learning_rate": 0.0001, "loss": 4.1159, "loss/crossentropy": 1.9223415851593018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21203956007957458, "step": 10986 }, { "epoch": 0.21976, "grad_norm": 2.0, "grad_norm_var": 0.020295206705729166, "learning_rate": 0.0001, "loss": 4.0177, "loss/crossentropy": 2.3779542446136475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23436614871025085, "step": 10988 }, { "epoch": 0.2198, "grad_norm": 2.046875, "grad_norm_var": 0.02195002237955729, "learning_rate": 0.0001, "loss": 4.0412, "loss/crossentropy": 1.8552000522613525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20212603360414505, "step": 10990 }, { "epoch": 0.21984, "grad_norm": 2.03125, "grad_norm_var": 0.0206451416015625, "learning_rate": 0.0001, "loss": 4.2264, "loss/crossentropy": 1.79928320646286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19676420837640762, "step": 10992 }, { "epoch": 0.21988, "grad_norm": 2.078125, "grad_norm_var": 0.018619791666666666, "learning_rate": 0.0001, "loss": 4.2426, "loss/crossentropy": 1.886910319328308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21270643919706345, "step": 10994 }, { "epoch": 0.21992, "grad_norm": 2.078125, "grad_norm_var": 0.019066365559895833, "learning_rate": 0.0001, "loss": 4.0775, "loss/crossentropy": 1.5682110786437988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19159042835235596, "step": 10996 }, { "epoch": 0.21996, "grad_norm": 2.21875, "grad_norm_var": 0.008104451497395833, "learning_rate": 0.0001, "loss": 4.2933, "loss/crossentropy": 2.3335143327713013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23225059360265732, "step": 10998 }, { "epoch": 0.22, "grad_norm": 1.953125, "grad_norm_var": 0.007222493489583333, "learning_rate": 0.0001, "loss": 4.3146, "loss/crossentropy": 2.2234359979629517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21465980261564255, "step": 11000 }, { "epoch": 0.22004, "grad_norm": 2.09375, "grad_norm_var": 0.00601806640625, "learning_rate": 0.0001, "loss": 4.6023, "loss/crossentropy": 2.3676271438598633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2674448639154434, "step": 11002 }, { "epoch": 0.22008, "grad_norm": 2.0, "grad_norm_var": 0.006180826822916667, "learning_rate": 0.0001, "loss": 4.1064, "loss/crossentropy": 1.9216612577438354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2036839798092842, "step": 11004 }, { "epoch": 0.22012, "grad_norm": 2.078125, "grad_norm_var": 0.004369099934895833, "learning_rate": 0.0001, "loss": 4.0427, "loss/crossentropy": 1.6732578873634338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18962469696998596, "step": 11006 }, { "epoch": 0.22016, "grad_norm": 2.265625, "grad_norm_var": 0.009761555989583334, "learning_rate": 0.0001, "loss": 4.2704, "loss/crossentropy": 2.0608668327331543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23521222919225693, "step": 11008 }, { "epoch": 0.2202, "grad_norm": 2.015625, "grad_norm_var": 0.010872395833333333, "learning_rate": 0.0001, "loss": 4.1808, "loss/crossentropy": 1.8038227558135986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1811496466398239, "step": 11010 }, { "epoch": 0.22024, "grad_norm": 2.15625, "grad_norm_var": 0.010789998372395833, "learning_rate": 0.0001, "loss": 4.3407, "loss/crossentropy": 1.8687627911567688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20196181535720825, "step": 11012 }, { "epoch": 0.22028, "grad_norm": 2.1875, "grad_norm_var": 0.01138916015625, "learning_rate": 0.0001, "loss": 4.0608, "loss/crossentropy": 1.8011687397956848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18384280055761337, "step": 11014 }, { "epoch": 0.22032, "grad_norm": 1.9609375, "grad_norm_var": 0.012727610270182292, "learning_rate": 0.0001, "loss": 4.0229, "loss/crossentropy": 1.8108493089675903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19042345136404037, "step": 11016 }, { "epoch": 0.22036, "grad_norm": 2.09375, "grad_norm_var": 0.012611643473307291, "learning_rate": 0.0001, "loss": 4.2755, "loss/crossentropy": 2.4056872129440308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22949732840061188, "step": 11018 }, { "epoch": 0.2204, "grad_norm": 2.0625, "grad_norm_var": 0.011451975504557291, "learning_rate": 0.0001, "loss": 4.2768, "loss/crossentropy": 2.0417627692222595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21793337166309357, "step": 11020 }, { "epoch": 0.22044, "grad_norm": 2.125, "grad_norm_var": 0.011549631754557291, "learning_rate": 0.0001, "loss": 4.2767, "loss/crossentropy": 1.8312503099441528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19038879871368408, "step": 11022 }, { "epoch": 0.22048, "grad_norm": 2.484375, "grad_norm_var": 0.01941095987955729, "learning_rate": 0.0001, "loss": 4.1846, "loss/crossentropy": 2.0615866780281067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224861241877079, "step": 11024 }, { "epoch": 0.22052, "grad_norm": 1.984375, "grad_norm_var": 0.02702611287434896, "learning_rate": 0.0001, "loss": 4.0216, "loss/crossentropy": 1.8578996062278748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20426958799362183, "step": 11026 }, { "epoch": 0.22056, "grad_norm": 1.9375, "grad_norm_var": 0.028364817301432293, "learning_rate": 0.0001, "loss": 3.9303, "loss/crossentropy": 2.01333224773407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140725627541542, "step": 11028 }, { "epoch": 0.2206, "grad_norm": 1.9609375, "grad_norm_var": 0.0296142578125, "learning_rate": 0.0001, "loss": 3.8142, "loss/crossentropy": 1.9607431292533875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19605513662099838, "step": 11030 }, { "epoch": 0.22064, "grad_norm": 2.078125, "grad_norm_var": 0.026712799072265626, "learning_rate": 0.0001, "loss": 4.1336, "loss/crossentropy": 1.8091335892677307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18755877763032913, "step": 11032 }, { "epoch": 0.22068, "grad_norm": 2.09375, "grad_norm_var": 0.0282958984375, "learning_rate": 0.0001, "loss": 4.0206, "loss/crossentropy": 1.5547168254852295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18891388177871704, "step": 11034 }, { "epoch": 0.22072, "grad_norm": 1.9453125, "grad_norm_var": 0.029605865478515625, "learning_rate": 0.0001, "loss": 4.1658, "loss/crossentropy": 2.0881760120391846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21140296012163162, "step": 11036 }, { "epoch": 0.22076, "grad_norm": 2.953125, "grad_norm_var": 0.07485936482747396, "learning_rate": 0.0001, "loss": 4.0865, "loss/crossentropy": 2.0874351263046265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202945575118065, "step": 11038 }, { "epoch": 0.2208, "grad_norm": 2.109375, "grad_norm_var": 0.06611302693684896, "learning_rate": 0.0001, "loss": 4.3998, "loss/crossentropy": 2.2349069118499756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2247784063220024, "step": 11040 }, { "epoch": 0.22084, "grad_norm": 2.328125, "grad_norm_var": 0.060373687744140626, "learning_rate": 0.0001, "loss": 4.4679, "loss/crossentropy": 2.0404593348503113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23377884924411774, "step": 11042 }, { "epoch": 0.22088, "grad_norm": 2.046875, "grad_norm_var": 0.05997314453125, "learning_rate": 0.0001, "loss": 4.115, "loss/crossentropy": 2.418761968612671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288268655538559, "step": 11044 }, { "epoch": 0.22092, "grad_norm": 2.25, "grad_norm_var": 0.05608495076497396, "learning_rate": 0.0001, "loss": 4.514, "loss/crossentropy": 2.3263691663742065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23381420969963074, "step": 11046 }, { "epoch": 0.22096, "grad_norm": 2.34375, "grad_norm_var": 0.058166249593098955, "learning_rate": 0.0001, "loss": 4.3659, "loss/crossentropy": 2.0020187497138977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993579939007759, "step": 11048 }, { "epoch": 0.221, "grad_norm": 1.984375, "grad_norm_var": 0.05726318359375, "learning_rate": 0.0001, "loss": 4.1869, "loss/crossentropy": 2.3417880535125732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21232012659311295, "step": 11050 }, { "epoch": 0.22104, "grad_norm": 2.375, "grad_norm_var": 0.06346817016601562, "learning_rate": 0.0001, "loss": 4.9882, "loss/crossentropy": 2.2952964305877686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22096505016088486, "step": 11052 }, { "epoch": 0.22108, "grad_norm": 2.015625, "grad_norm_var": 0.028562164306640624, "learning_rate": 0.0001, "loss": 4.2431, "loss/crossentropy": 1.7963152527809143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18996400386095047, "step": 11054 }, { "epoch": 0.22112, "grad_norm": 2.109375, "grad_norm_var": 0.028433990478515626, "learning_rate": 0.0001, "loss": 4.4276, "loss/crossentropy": 2.1328593492507935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21772069483995438, "step": 11056 }, { "epoch": 0.22116, "grad_norm": 2.3125, "grad_norm_var": 0.0318267822265625, "learning_rate": 0.0001, "loss": 4.317, "loss/crossentropy": 1.8048993349075317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17359213531017303, "step": 11058 }, { "epoch": 0.2212, "grad_norm": 1.9609375, "grad_norm_var": 0.0309234619140625, "learning_rate": 0.0001, "loss": 4.0173, "loss/crossentropy": 1.9539333581924438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20828361809253693, "step": 11060 }, { "epoch": 0.22124, "grad_norm": 1.9921875, "grad_norm_var": 0.032291412353515625, "learning_rate": 0.0001, "loss": 4.0288, "loss/crossentropy": 1.8231340050697327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20730753242969513, "step": 11062 }, { "epoch": 0.22128, "grad_norm": 2.25, "grad_norm_var": 0.033878326416015625, "learning_rate": 0.0001, "loss": 4.2756, "loss/crossentropy": 1.9315263032913208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20077265799045563, "step": 11064 }, { "epoch": 0.22132, "grad_norm": 1.9375, "grad_norm_var": 0.034795888264973956, "learning_rate": 0.0001, "loss": 4.2137, "loss/crossentropy": 1.9278368949890137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2203744500875473, "step": 11066 }, { "epoch": 0.22136, "grad_norm": 2.21875, "grad_norm_var": 0.016812896728515624, "learning_rate": 0.0001, "loss": 4.38, "loss/crossentropy": 1.9314138889312744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2024018093943596, "step": 11068 }, { "epoch": 0.2214, "grad_norm": 1.9609375, "grad_norm_var": 0.019618479410807292, "learning_rate": 0.0001, "loss": 3.9142, "loss/crossentropy": 1.822297751903534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20530518889427185, "step": 11070 }, { "epoch": 0.22144, "grad_norm": 2.09375, "grad_norm_var": 0.02112401326497396, "learning_rate": 0.0001, "loss": 4.1412, "loss/crossentropy": 2.0021358132362366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963522955775261, "step": 11072 }, { "epoch": 0.22148, "grad_norm": 2.15625, "grad_norm_var": 0.018293253580729165, "learning_rate": 0.0001, "loss": 4.5595, "loss/crossentropy": 2.3780601024627686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24999547004699707, "step": 11074 }, { "epoch": 0.22152, "grad_norm": 2.109375, "grad_norm_var": 0.01727472941080729, "learning_rate": 0.0001, "loss": 4.2572, "loss/crossentropy": 1.8128371238708496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1940363049507141, "step": 11076 }, { "epoch": 0.22156, "grad_norm": 2.09375, "grad_norm_var": 0.0147857666015625, "learning_rate": 0.0001, "loss": 4.1388, "loss/crossentropy": 1.836561381816864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19289480894804, "step": 11078 }, { "epoch": 0.2216, "grad_norm": 2.03125, "grad_norm_var": 0.0125640869140625, "learning_rate": 0.0001, "loss": 3.9891, "loss/crossentropy": 2.203549385070801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23240803182125092, "step": 11080 }, { "epoch": 0.22164, "grad_norm": 1.8359375, "grad_norm_var": 0.0147857666015625, "learning_rate": 0.0001, "loss": 3.9827, "loss/crossentropy": 2.126620829105377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21564993262290955, "step": 11082 }, { "epoch": 0.22168, "grad_norm": 2.203125, "grad_norm_var": 0.015672810872395835, "learning_rate": 0.0001, "loss": 4.6468, "loss/crossentropy": 2.5914783477783203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23293393850326538, "step": 11084 }, { "epoch": 0.22172, "grad_norm": 2.046875, "grad_norm_var": 0.016752115885416665, "learning_rate": 0.0001, "loss": 4.0881, "loss/crossentropy": 2.097515106201172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21570277214050293, "step": 11086 }, { "epoch": 0.22176, "grad_norm": 2.203125, "grad_norm_var": 0.01565526326497396, "learning_rate": 0.0001, "loss": 4.2469, "loss/crossentropy": 1.9408356547355652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22177009284496307, "step": 11088 }, { "epoch": 0.2218, "grad_norm": 2.171875, "grad_norm_var": 0.014713287353515625, "learning_rate": 0.0001, "loss": 4.378, "loss/crossentropy": 1.9851223826408386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20538930594921112, "step": 11090 }, { "epoch": 0.22184, "grad_norm": 2.03125, "grad_norm_var": 0.014694976806640624, "learning_rate": 0.0001, "loss": 4.2424, "loss/crossentropy": 2.261754631996155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21294979751110077, "step": 11092 }, { "epoch": 0.22188, "grad_norm": 2.015625, "grad_norm_var": 0.014924875895182292, "learning_rate": 0.0001, "loss": 4.1847, "loss/crossentropy": 1.978752851486206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21184594929218292, "step": 11094 }, { "epoch": 0.22192, "grad_norm": 2.21875, "grad_norm_var": 0.014918772379557292, "learning_rate": 0.0001, "loss": 4.515, "loss/crossentropy": 2.1992926597595215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2428576499223709, "step": 11096 }, { "epoch": 0.22196, "grad_norm": 2.078125, "grad_norm_var": 0.010131581624348959, "learning_rate": 0.0001, "loss": 4.1475, "loss/crossentropy": 1.9407767057418823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25351718813180923, "step": 11098 }, { "epoch": 0.222, "grad_norm": 2.234375, "grad_norm_var": 0.009993235270182291, "learning_rate": 0.0001, "loss": 4.2485, "loss/crossentropy": 2.2973347902297974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211253046989441, "step": 11100 }, { "epoch": 0.22204, "grad_norm": 2.109375, "grad_norm_var": 0.007380930582682291, "learning_rate": 0.0001, "loss": 4.466, "loss/crossentropy": 2.323284387588501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23907601833343506, "step": 11102 }, { "epoch": 0.22208, "grad_norm": 2.21875, "grad_norm_var": 0.00631103515625, "learning_rate": 0.0001, "loss": 4.3745, "loss/crossentropy": 1.9557610750198364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20206698775291443, "step": 11104 }, { "epoch": 0.22212, "grad_norm": 2.203125, "grad_norm_var": 0.006083170572916667, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 2.130094528198242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22658853232860565, "step": 11106 }, { "epoch": 0.22216, "grad_norm": 2.140625, "grad_norm_var": 0.006078084309895833, "learning_rate": 0.0001, "loss": 4.0207, "loss/crossentropy": 1.6461073160171509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.183306485414505, "step": 11108 }, { "epoch": 0.2222, "grad_norm": 2.265625, "grad_norm_var": 0.008310699462890625, "learning_rate": 0.0001, "loss": 4.2901, "loss/crossentropy": 1.9869291186332703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20724371075630188, "step": 11110 }, { "epoch": 0.22224, "grad_norm": 2.15625, "grad_norm_var": 0.007696278889973958, "learning_rate": 0.0001, "loss": 4.2361, "loss/crossentropy": 2.314267873764038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22680803388357162, "step": 11112 }, { "epoch": 0.22228, "grad_norm": 2.109375, "grad_norm_var": 0.0104888916015625, "learning_rate": 0.0001, "loss": 4.37, "loss/crossentropy": 2.061935067176819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931627094745636, "step": 11114 }, { "epoch": 0.22232, "grad_norm": 2.03125, "grad_norm_var": 0.009601847330729166, "learning_rate": 0.0001, "loss": 4.1174, "loss/crossentropy": 2.157355546951294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2328970953822136, "step": 11116 }, { "epoch": 0.22236, "grad_norm": 2.1875, "grad_norm_var": 0.012214914957682291, "learning_rate": 0.0001, "loss": 4.2092, "loss/crossentropy": 1.8968737125396729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19509851932525635, "step": 11118 }, { "epoch": 0.2224, "grad_norm": 2.15625, "grad_norm_var": 0.013516998291015625, "learning_rate": 0.0001, "loss": 4.3433, "loss/crossentropy": 2.1147825717926025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22639968246221542, "step": 11120 }, { "epoch": 0.22244, "grad_norm": 1.8828125, "grad_norm_var": 0.01519775390625, "learning_rate": 0.0001, "loss": 3.9818, "loss/crossentropy": 1.9084222316741943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16847409307956696, "step": 11122 }, { "epoch": 0.22248, "grad_norm": 2.0625, "grad_norm_var": 0.015028635660807291, "learning_rate": 0.0001, "loss": 4.0188, "loss/crossentropy": 1.8558747172355652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013881430029869, "step": 11124 }, { "epoch": 0.22252, "grad_norm": 2.203125, "grad_norm_var": 0.015755208333333333, "learning_rate": 0.0001, "loss": 4.3236, "loss/crossentropy": 2.2021052837371826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2129388153553009, "step": 11126 }, { "epoch": 0.22256, "grad_norm": 1.8125, "grad_norm_var": 0.019212849934895835, "learning_rate": 0.0001, "loss": 4.2892, "loss/crossentropy": 2.267111897468567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22087457031011581, "step": 11128 }, { "epoch": 0.2226, "grad_norm": 2.0625, "grad_norm_var": 0.02021052042643229, "learning_rate": 0.0001, "loss": 4.3781, "loss/crossentropy": 1.860244870185852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21286892145872116, "step": 11130 }, { "epoch": 0.22264, "grad_norm": 2.046875, "grad_norm_var": 0.021142578125, "learning_rate": 0.0001, "loss": 3.7832, "loss/crossentropy": 1.8312670588493347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20548796653747559, "step": 11132 }, { "epoch": 0.22268, "grad_norm": 2.015625, "grad_norm_var": 0.019087473551432293, "learning_rate": 0.0001, "loss": 4.1916, "loss/crossentropy": 2.2309051752090454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244933694601059, "step": 11134 }, { "epoch": 0.22272, "grad_norm": 2.09375, "grad_norm_var": 0.01761449178059896, "learning_rate": 0.0001, "loss": 4.1125, "loss/crossentropy": 2.1003236770629883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22222542017698288, "step": 11136 }, { "epoch": 0.22276, "grad_norm": 1.953125, "grad_norm_var": 0.015965779622395832, "learning_rate": 0.0001, "loss": 3.9391, "loss/crossentropy": 1.95048588514328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614230632781982, "step": 11138 }, { "epoch": 0.2228, "grad_norm": 1.9765625, "grad_norm_var": 0.016364542643229167, "learning_rate": 0.0001, "loss": 4.1608, "loss/crossentropy": 2.2856240272521973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23748096823692322, "step": 11140 }, { "epoch": 0.22284, "grad_norm": 2.03125, "grad_norm_var": 0.012455240885416666, "learning_rate": 0.0001, "loss": 4.2187, "loss/crossentropy": 2.3873090744018555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23743586987257004, "step": 11142 }, { "epoch": 0.22288, "grad_norm": 2.09375, "grad_norm_var": 0.008234659830729166, "learning_rate": 0.0001, "loss": 4.1283, "loss/crossentropy": 2.248707115650177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22035827487707138, "step": 11144 }, { "epoch": 0.22292, "grad_norm": 2.09375, "grad_norm_var": 0.0067860921223958336, "learning_rate": 0.0001, "loss": 4.3561, "loss/crossentropy": 1.9792630672454834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102261707186699, "step": 11146 }, { "epoch": 0.22296, "grad_norm": 2.078125, "grad_norm_var": 0.006156158447265625, "learning_rate": 0.0001, "loss": 4.0479, "loss/crossentropy": 2.284587323665619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20941253006458282, "step": 11148 }, { "epoch": 0.223, "grad_norm": 2.03125, "grad_norm_var": 0.006361643473307292, "learning_rate": 0.0001, "loss": 4.1792, "loss/crossentropy": 1.9800288677215576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20009202510118484, "step": 11150 }, { "epoch": 0.22304, "grad_norm": 2.046875, "grad_norm_var": 0.0064165751139322914, "learning_rate": 0.0001, "loss": 4.1859, "loss/crossentropy": 1.925826370716095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18616148829460144, "step": 11152 }, { "epoch": 0.22308, "grad_norm": 2.046875, "grad_norm_var": 0.0053708394368489586, "learning_rate": 0.0001, "loss": 3.9281, "loss/crossentropy": 2.158196806907654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21486472338438034, "step": 11154 }, { "epoch": 0.22312, "grad_norm": 2.078125, "grad_norm_var": 0.004393513997395833, "learning_rate": 0.0001, "loss": 4.3342, "loss/crossentropy": 2.0873841047286987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22101643681526184, "step": 11156 }, { "epoch": 0.22316, "grad_norm": 2.0625, "grad_norm_var": 0.0044748942057291664, "learning_rate": 0.0001, "loss": 4.3772, "loss/crossentropy": 1.9485042691230774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19757962226867676, "step": 11158 }, { "epoch": 0.2232, "grad_norm": 2.0625, "grad_norm_var": 0.004279581705729166, "learning_rate": 0.0001, "loss": 4.331, "loss/crossentropy": 2.055552661418915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19271192699670792, "step": 11160 }, { "epoch": 0.22324, "grad_norm": 2.046875, "grad_norm_var": 0.0033599853515625, "learning_rate": 0.0001, "loss": 4.0958, "loss/crossentropy": 1.9115247130393982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1922249048948288, "step": 11162 }, { "epoch": 0.22328, "grad_norm": 1.9609375, "grad_norm_var": 0.0043609619140625, "learning_rate": 0.0001, "loss": 4.2582, "loss/crossentropy": 2.1461042761802673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2185581848025322, "step": 11164 }, { "epoch": 0.22332, "grad_norm": 2.109375, "grad_norm_var": 0.004808553059895833, "learning_rate": 0.0001, "loss": 4.1212, "loss/crossentropy": 2.1625128984451294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20505116879940033, "step": 11166 }, { "epoch": 0.22336, "grad_norm": 2.109375, "grad_norm_var": 0.003692372639973958, "learning_rate": 0.0001, "loss": 4.0317, "loss/crossentropy": 1.9978403449058533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015106976032257, "step": 11168 }, { "epoch": 0.2234, "grad_norm": 2.109375, "grad_norm_var": 0.003794097900390625, "learning_rate": 0.0001, "loss": 4.0108, "loss/crossentropy": 1.9462909698486328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20814144611358643, "step": 11170 }, { "epoch": 0.22344, "grad_norm": 1.9453125, "grad_norm_var": 0.004423014322916667, "learning_rate": 0.0001, "loss": 4.2147, "loss/crossentropy": 2.262490153312683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21692577749490738, "step": 11172 }, { "epoch": 0.22348, "grad_norm": 2.140625, "grad_norm_var": 0.0052734375, "learning_rate": 0.0001, "loss": 4.4079, "loss/crossentropy": 2.092001974582672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22943469882011414, "step": 11174 }, { "epoch": 0.22352, "grad_norm": 2.09375, "grad_norm_var": 0.0052734375, "learning_rate": 0.0001, "loss": 4.273, "loss/crossentropy": 2.302329421043396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21188674122095108, "step": 11176 }, { "epoch": 0.22356, "grad_norm": 1.9765625, "grad_norm_var": 0.008701324462890625, "learning_rate": 0.0001, "loss": 4.0762, "loss/crossentropy": 1.7181463837623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19460663199424744, "step": 11178 }, { "epoch": 0.2236, "grad_norm": 2.046875, "grad_norm_var": 0.007895660400390626, "learning_rate": 0.0001, "loss": 4.0061, "loss/crossentropy": 1.8402328491210938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928708627820015, "step": 11180 }, { "epoch": 0.22364, "grad_norm": 2.21875, "grad_norm_var": 0.008103179931640624, "learning_rate": 0.0001, "loss": 4.2968, "loss/crossentropy": 1.9490735530853271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2492925077676773, "step": 11182 }, { "epoch": 0.22368, "grad_norm": 2.015625, "grad_norm_var": 0.00867919921875, "learning_rate": 0.0001, "loss": 4.113, "loss/crossentropy": 2.367197036743164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25882700830698013, "step": 11184 }, { "epoch": 0.22372, "grad_norm": 2.015625, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 4.0782, "loss/crossentropy": 2.1192296743392944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2208312824368477, "step": 11186 }, { "epoch": 0.22376, "grad_norm": 2.296875, "grad_norm_var": 0.010772450764973959, "learning_rate": 0.0001, "loss": 4.7239, "loss/crossentropy": 2.2150460481643677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21440500020980835, "step": 11188 }, { "epoch": 0.2238, "grad_norm": 2.203125, "grad_norm_var": 0.0121490478515625, "learning_rate": 0.0001, "loss": 4.4245, "loss/crossentropy": 2.0837016105651855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2777148336172104, "step": 11190 }, { "epoch": 0.22384, "grad_norm": 1.9609375, "grad_norm_var": 0.014240519205729166, "learning_rate": 0.0001, "loss": 3.918, "loss/crossentropy": 1.7995057106018066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18295922130346298, "step": 11192 }, { "epoch": 0.22388, "grad_norm": 2.1875, "grad_norm_var": 0.014357248942057291, "learning_rate": 0.0001, "loss": 4.3361, "loss/crossentropy": 2.2150347232818604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24010418355464935, "step": 11194 }, { "epoch": 0.22392, "grad_norm": 2.140625, "grad_norm_var": 0.014357248942057291, "learning_rate": 0.0001, "loss": 4.1711, "loss/crossentropy": 2.1161770820617676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124250829219818, "step": 11196 }, { "epoch": 0.22396, "grad_norm": 2.0625, "grad_norm_var": 0.013578033447265625, "learning_rate": 0.0001, "loss": 4.4875, "loss/crossentropy": 2.224100112915039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23966734111309052, "step": 11198 }, { "epoch": 0.224, "grad_norm": 2.09375, "grad_norm_var": 0.011921946207682292, "learning_rate": 0.0001, "loss": 4.377, "loss/crossentropy": 2.196989417076111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21709956228733063, "step": 11200 }, { "epoch": 0.22404, "grad_norm": 2.03125, "grad_norm_var": 0.011574045817057291, "learning_rate": 0.0001, "loss": 4.0457, "loss/crossentropy": 1.6592280864715576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827787384390831, "step": 11202 }, { "epoch": 0.22408, "grad_norm": 2.109375, "grad_norm_var": 0.008790842692057292, "learning_rate": 0.0001, "loss": 4.0564, "loss/crossentropy": 1.9689467549324036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20387520641088486, "step": 11204 }, { "epoch": 0.22412, "grad_norm": 2.8125, "grad_norm_var": 0.03997802734375, "learning_rate": 0.0001, "loss": 4.3494, "loss/crossentropy": 2.1846379041671753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068176046013832, "step": 11206 }, { "epoch": 0.22416, "grad_norm": 2.0, "grad_norm_var": 0.037353515625, "learning_rate": 0.0001, "loss": 4.3948, "loss/crossentropy": 1.8361602425575256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22452392429113388, "step": 11208 }, { "epoch": 0.2242, "grad_norm": 1.9375, "grad_norm_var": 0.0390625, "learning_rate": 0.0001, "loss": 4.3597, "loss/crossentropy": 2.5658172369003296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2602368891239166, "step": 11210 }, { "epoch": 0.22424, "grad_norm": 1.9296875, "grad_norm_var": 0.04119440714518229, "learning_rate": 0.0001, "loss": 4.0555, "loss/crossentropy": 1.8167916536331177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20330622047185898, "step": 11212 }, { "epoch": 0.22428, "grad_norm": 2.140625, "grad_norm_var": 0.04070612589518229, "learning_rate": 0.0001, "loss": 4.2374, "loss/crossentropy": 1.9151161313056946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20695462822914124, "step": 11214 }, { "epoch": 0.22432, "grad_norm": 1.8515625, "grad_norm_var": 0.04528401692708333, "learning_rate": 0.0001, "loss": 3.9807, "loss/crossentropy": 1.9590752720832825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945583075284958, "step": 11216 }, { "epoch": 0.22436, "grad_norm": 2.0625, "grad_norm_var": 0.04522298177083333, "learning_rate": 0.0001, "loss": 4.2803, "loss/crossentropy": 1.9777710437774658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20104076713323593, "step": 11218 }, { "epoch": 0.2244, "grad_norm": 1.9296875, "grad_norm_var": 0.04744847615559896, "learning_rate": 0.0001, "loss": 4.0279, "loss/crossentropy": 1.725940465927124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19038556516170502, "step": 11220 }, { "epoch": 0.22444, "grad_norm": 2.203125, "grad_norm_var": 0.012963612874348959, "learning_rate": 0.0001, "loss": 4.4067, "loss/crossentropy": 2.505728602409363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2434321641921997, "step": 11222 }, { "epoch": 0.22448, "grad_norm": 2.03125, "grad_norm_var": 0.010518137613932292, "learning_rate": 0.0001, "loss": 4.2902, "loss/crossentropy": 2.0733951330184937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21223794668912888, "step": 11224 }, { "epoch": 0.22452, "grad_norm": 2.03125, "grad_norm_var": 0.010628255208333333, "learning_rate": 0.0001, "loss": 4.1548, "loss/crossentropy": 2.105073928833008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22444891929626465, "step": 11226 }, { "epoch": 0.22456, "grad_norm": 2.109375, "grad_norm_var": 0.009745025634765625, "learning_rate": 0.0001, "loss": 4.2361, "loss/crossentropy": 1.9164994359016418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20893585681915283, "step": 11228 }, { "epoch": 0.2246, "grad_norm": 2.328125, "grad_norm_var": 0.014902496337890625, "learning_rate": 0.0001, "loss": 4.3753, "loss/crossentropy": 2.0922536849975586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22757402062416077, "step": 11230 }, { "epoch": 0.22464, "grad_norm": 2.015625, "grad_norm_var": 0.01190185546875, "learning_rate": 0.0001, "loss": 3.9791, "loss/crossentropy": 1.886056363582611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054021805524826, "step": 11232 }, { "epoch": 0.22468, "grad_norm": 1.9609375, "grad_norm_var": 0.014300282796223958, "learning_rate": 0.0001, "loss": 3.9194, "loss/crossentropy": 2.028389871120453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2109459862112999, "step": 11234 }, { "epoch": 0.22472, "grad_norm": 2.078125, "grad_norm_var": 0.012813313802083334, "learning_rate": 0.0001, "loss": 4.2084, "loss/crossentropy": 1.9590765833854675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19774210453033447, "step": 11236 }, { "epoch": 0.22476, "grad_norm": 2.03125, "grad_norm_var": 0.012800852457682291, "learning_rate": 0.0001, "loss": 4.0153, "loss/crossentropy": 1.8284756541252136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17485490441322327, "step": 11238 }, { "epoch": 0.2248, "grad_norm": 2.203125, "grad_norm_var": 0.014625803629557291, "learning_rate": 0.0001, "loss": 4.3494, "loss/crossentropy": 2.1119225025177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22235971689224243, "step": 11240 }, { "epoch": 0.22484, "grad_norm": 2.1875, "grad_norm_var": 0.015209706624348958, "learning_rate": 0.0001, "loss": 4.3303, "loss/crossentropy": 2.3050538301467896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2249491587281227, "step": 11242 }, { "epoch": 0.22488, "grad_norm": 2.0625, "grad_norm_var": 0.015360514322916666, "learning_rate": 0.0001, "loss": 4.1235, "loss/crossentropy": 1.9806398153305054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20141054689884186, "step": 11244 }, { "epoch": 0.22492, "grad_norm": 1.9609375, "grad_norm_var": 0.007879384358723958, "learning_rate": 0.0001, "loss": 4.0793, "loss/crossentropy": 2.0702012181282043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980852484703064, "step": 11246 }, { "epoch": 0.22496, "grad_norm": 2.0625, "grad_norm_var": 0.008113606770833334, "learning_rate": 0.0001, "loss": 3.9997, "loss/crossentropy": 2.205365300178528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22107956558465958, "step": 11248 }, { "epoch": 0.225, "grad_norm": 2.890625, "grad_norm_var": 0.05272598266601562, "learning_rate": 0.0001, "loss": 4.0748, "loss/crossentropy": 2.196021556854248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.199070006608963, "step": 11250 }, { "epoch": 0.22504, "grad_norm": 2.140625, "grad_norm_var": 0.05241673787434896, "learning_rate": 0.0001, "loss": 4.2062, "loss/crossentropy": 2.0661864280700684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22117173671722412, "step": 11252 }, { "epoch": 0.22508, "grad_norm": 2.15625, "grad_norm_var": 0.050687408447265624, "learning_rate": 0.0001, "loss": 3.9176, "loss/crossentropy": 1.825901210308075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.174439437687397, "step": 11254 }, { "epoch": 0.22512, "grad_norm": 2.3125, "grad_norm_var": 0.061470286051432295, "learning_rate": 0.0001, "loss": 4.7052, "loss/crossentropy": 2.3710498809814453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2357504665851593, "step": 11256 }, { "epoch": 0.22516, "grad_norm": 1.859375, "grad_norm_var": 0.06520182291666667, "learning_rate": 0.0001, "loss": 3.9419, "loss/crossentropy": 2.1523157358169556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179568186402321, "step": 11258 }, { "epoch": 0.2252, "grad_norm": 2.0, "grad_norm_var": 0.06559015909830729, "learning_rate": 0.0001, "loss": 4.1953, "loss/crossentropy": 2.0974292755126953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980036199092865, "step": 11260 }, { "epoch": 0.22524, "grad_norm": 2.234375, "grad_norm_var": 0.06339925130208333, "learning_rate": 0.0001, "loss": 4.4409, "loss/crossentropy": 2.2679883241653442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22085320204496384, "step": 11262 }, { "epoch": 0.22528, "grad_norm": 2.109375, "grad_norm_var": 0.06108373006184896, "learning_rate": 0.0001, "loss": 4.2972, "loss/crossentropy": 2.0697352290153503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21976519376039505, "step": 11264 }, { "epoch": 0.22532, "grad_norm": 2.03125, "grad_norm_var": 0.025465647379557293, "learning_rate": 0.0001, "loss": 4.4149, "loss/crossentropy": 2.1807767748832703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21304857730865479, "step": 11266 }, { "epoch": 0.22536, "grad_norm": 2.140625, "grad_norm_var": 0.02541071573893229, "learning_rate": 0.0001, "loss": 4.4395, "loss/crossentropy": 2.178891122341156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22129195928573608, "step": 11268 }, { "epoch": 0.2254, "grad_norm": 2.109375, "grad_norm_var": 0.023607381184895835, "learning_rate": 0.0001, "loss": 4.2908, "loss/crossentropy": 2.1025387048721313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20581847429275513, "step": 11270 }, { "epoch": 0.22544, "grad_norm": 1.8671875, "grad_norm_var": 0.013055165608723959, "learning_rate": 0.0001, "loss": 4.2192, "loss/crossentropy": 1.8710424900054932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900918111205101, "step": 11272 }, { "epoch": 0.22548, "grad_norm": 2.296875, "grad_norm_var": 0.012373606363932291, "learning_rate": 0.0001, "loss": 4.498, "loss/crossentropy": 1.8837561011314392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21019018441438675, "step": 11274 }, { "epoch": 0.22552, "grad_norm": 1.984375, "grad_norm_var": 0.013132476806640625, "learning_rate": 0.0001, "loss": 4.3139, "loss/crossentropy": 2.2795380353927612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23858627676963806, "step": 11276 }, { "epoch": 0.22556, "grad_norm": 1.9765625, "grad_norm_var": 0.0158111572265625, "learning_rate": 0.0001, "loss": 4.0532, "loss/crossentropy": 2.0241262316703796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1990864798426628, "step": 11278 }, { "epoch": 0.2256, "grad_norm": 2.40625, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 4.2935, "loss/crossentropy": 1.9307058453559875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100597321987152, "step": 11280 }, { "epoch": 0.22564, "grad_norm": 2.171875, "grad_norm_var": 0.020414225260416665, "learning_rate": 0.0001, "loss": 4.2523, "loss/crossentropy": 2.08352792263031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21930356323719025, "step": 11282 }, { "epoch": 0.22568, "grad_norm": 2.171875, "grad_norm_var": 0.021361287434895834, "learning_rate": 0.0001, "loss": 4.4277, "loss/crossentropy": 2.393290877342224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560981214046478, "step": 11284 }, { "epoch": 0.22572, "grad_norm": 2.0, "grad_norm_var": 0.023738606770833334, "learning_rate": 0.0001, "loss": 4.1612, "loss/crossentropy": 2.20097017288208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20318082720041275, "step": 11286 }, { "epoch": 0.22576, "grad_norm": 1.9453125, "grad_norm_var": 0.0221099853515625, "learning_rate": 0.0001, "loss": 4.2307, "loss/crossentropy": 2.031981647014618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20313747972249985, "step": 11288 }, { "epoch": 0.2258, "grad_norm": 2.078125, "grad_norm_var": 0.0187255859375, "learning_rate": 0.0001, "loss": 4.3513, "loss/crossentropy": 2.197006046772003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2301759570837021, "step": 11290 }, { "epoch": 0.22584, "grad_norm": 2.0625, "grad_norm_var": 0.0197021484375, "learning_rate": 0.0001, "loss": 4.2903, "loss/crossentropy": 2.294014096260071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21280069649219513, "step": 11292 }, { "epoch": 0.22588, "grad_norm": 1.9921875, "grad_norm_var": 0.018089803059895833, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 2.0551719665527344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20910422503948212, "step": 11294 }, { "epoch": 0.22592, "grad_norm": 2.109375, "grad_norm_var": 0.010636393229166667, "learning_rate": 0.0001, "loss": 4.2109, "loss/crossentropy": 2.1044358015060425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20463456213474274, "step": 11296 }, { "epoch": 0.22596, "grad_norm": 2.265625, "grad_norm_var": 0.012631988525390625, "learning_rate": 0.0001, "loss": 4.198, "loss/crossentropy": 2.1013529300689697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21202504634857178, "step": 11298 }, { "epoch": 0.226, "grad_norm": 2.046875, "grad_norm_var": 0.009329986572265626, "learning_rate": 0.0001, "loss": 4.4227, "loss/crossentropy": 2.1887649297714233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048492729663849, "step": 11300 }, { "epoch": 0.22604, "grad_norm": 2.015625, "grad_norm_var": 0.009525299072265625, "learning_rate": 0.0001, "loss": 4.0952, "loss/crossentropy": 1.8243364691734314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013952136039734, "step": 11302 }, { "epoch": 0.22608, "grad_norm": 2.09375, "grad_norm_var": 0.0137451171875, "learning_rate": 0.0001, "loss": 4.4636, "loss/crossentropy": 2.2550541162490845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2493698000907898, "step": 11304 }, { "epoch": 0.22612, "grad_norm": 1.953125, "grad_norm_var": 0.0149169921875, "learning_rate": 0.0001, "loss": 3.9407, "loss/crossentropy": 1.8744492530822754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19957631081342697, "step": 11306 }, { "epoch": 0.22616, "grad_norm": 2.015625, "grad_norm_var": 0.013681793212890625, "learning_rate": 0.0001, "loss": 4.0717, "loss/crossentropy": 2.026526629924774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20474696904420853, "step": 11308 }, { "epoch": 0.2262, "grad_norm": 1.9453125, "grad_norm_var": 0.015881093343098958, "learning_rate": 0.0001, "loss": 4.3898, "loss/crossentropy": 2.3802725076675415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.247590571641922, "step": 11310 }, { "epoch": 0.22624, "grad_norm": 2.015625, "grad_norm_var": 0.016123199462890626, "learning_rate": 0.0001, "loss": 4.171, "loss/crossentropy": 1.9359605312347412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.207523413002491, "step": 11312 }, { "epoch": 0.22628, "grad_norm": 2.046875, "grad_norm_var": 0.0129547119140625, "learning_rate": 0.0001, "loss": 4.1634, "loss/crossentropy": 2.3035311698913574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21815945208072662, "step": 11314 }, { "epoch": 0.22632, "grad_norm": 2.234375, "grad_norm_var": 0.0225341796875, "learning_rate": 0.0001, "loss": 4.3189, "loss/crossentropy": 1.8083779215812683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20705801248550415, "step": 11316 }, { "epoch": 0.22636, "grad_norm": 2.203125, "grad_norm_var": 0.0205718994140625, "learning_rate": 0.0001, "loss": 4.2912, "loss/crossentropy": 2.306682825088501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22666794806718826, "step": 11318 }, { "epoch": 0.2264, "grad_norm": 2.0, "grad_norm_var": 0.01737060546875, "learning_rate": 0.0001, "loss": 4.0519, "loss/crossentropy": 1.809365153312683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20744331926107407, "step": 11320 }, { "epoch": 0.22644, "grad_norm": 2.265625, "grad_norm_var": 0.017894490559895834, "learning_rate": 0.0001, "loss": 4.2428, "loss/crossentropy": 2.289853572845459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23791835457086563, "step": 11322 }, { "epoch": 0.22648, "grad_norm": 2.53125, "grad_norm_var": 0.027341461181640624, "learning_rate": 0.0001, "loss": 4.5668, "loss/crossentropy": 2.0609280467033386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166810780763626, "step": 11324 }, { "epoch": 0.22652, "grad_norm": 2.0625, "grad_norm_var": 0.02535400390625, "learning_rate": 0.0001, "loss": 4.0567, "loss/crossentropy": 2.1174912452697754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20515341311693192, "step": 11326 }, { "epoch": 0.22656, "grad_norm": 1.890625, "grad_norm_var": 0.029319000244140626, "learning_rate": 0.0001, "loss": 4.0318, "loss/crossentropy": 1.99192476272583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20127686113119125, "step": 11328 }, { "epoch": 0.2266, "grad_norm": 1.9921875, "grad_norm_var": 0.031060536702473957, "learning_rate": 0.0001, "loss": 3.9552, "loss/crossentropy": 1.7805609107017517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17500489950180054, "step": 11330 }, { "epoch": 0.22664, "grad_norm": 2.171875, "grad_norm_var": 0.02398656209309896, "learning_rate": 0.0001, "loss": 4.1417, "loss/crossentropy": 2.024085283279419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20659280568361282, "step": 11332 }, { "epoch": 0.22668, "grad_norm": 1.9609375, "grad_norm_var": 0.023631795247395834, "learning_rate": 0.0001, "loss": 4.2674, "loss/crossentropy": 2.174618899822235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23250284045934677, "step": 11334 }, { "epoch": 0.22672, "grad_norm": 1.9375, "grad_norm_var": 0.02451171875, "learning_rate": 0.0001, "loss": 4.213, "loss/crossentropy": 1.886943757534027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18855806440114975, "step": 11336 }, { "epoch": 0.22676, "grad_norm": 1.9609375, "grad_norm_var": 0.02182184855143229, "learning_rate": 0.0001, "loss": 4.1281, "loss/crossentropy": 2.060371160507202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951771154999733, "step": 11338 }, { "epoch": 0.2268, "grad_norm": 2.046875, "grad_norm_var": 0.005576324462890625, "learning_rate": 0.0001, "loss": 4.1692, "loss/crossentropy": 2.15705668926239, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23738030344247818, "step": 11340 }, { "epoch": 0.22684, "grad_norm": 2.015625, "grad_norm_var": 0.005576324462890625, "learning_rate": 0.0001, "loss": 4.2096, "loss/crossentropy": 2.1922959089279175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24581187963485718, "step": 11342 }, { "epoch": 0.22688, "grad_norm": 2.0, "grad_norm_var": 0.007303873697916667, "learning_rate": 0.0001, "loss": 3.9786, "loss/crossentropy": 2.1590365171432495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20936457812786102, "step": 11344 }, { "epoch": 0.22692, "grad_norm": 2.53125, "grad_norm_var": 0.020783487955729166, "learning_rate": 0.0001, "loss": 4.2382, "loss/crossentropy": 1.8087702989578247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18874070048332214, "step": 11346 }, { "epoch": 0.22696, "grad_norm": 2.203125, "grad_norm_var": 0.021732584635416666, "learning_rate": 0.0001, "loss": 4.4981, "loss/crossentropy": 2.5205971002578735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24057865887880325, "step": 11348 }, { "epoch": 0.227, "grad_norm": 2.078125, "grad_norm_var": 0.02061945597330729, "learning_rate": 0.0001, "loss": 4.16, "loss/crossentropy": 2.114508092403412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20631257444620132, "step": 11350 }, { "epoch": 0.22704, "grad_norm": 2.21875, "grad_norm_var": 0.020881144205729167, "learning_rate": 0.0001, "loss": 4.4344, "loss/crossentropy": 2.385537028312683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.233476921916008, "step": 11352 }, { "epoch": 0.22708, "grad_norm": 2.21875, "grad_norm_var": 0.0217926025390625, "learning_rate": 0.0001, "loss": 4.3335, "loss/crossentropy": 2.4251039028167725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24502182751893997, "step": 11354 }, { "epoch": 0.22712, "grad_norm": 2.078125, "grad_norm_var": 0.025690714518229168, "learning_rate": 0.0001, "loss": 4.345, "loss/crossentropy": 2.278030514717102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22328510880470276, "step": 11356 }, { "epoch": 0.22716, "grad_norm": 2.03125, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 4.2835, "loss/crossentropy": 1.7373265027999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17635879665613174, "step": 11358 }, { "epoch": 0.2272, "grad_norm": 2.25, "grad_norm_var": 0.024461873372395835, "learning_rate": 0.0001, "loss": 4.3644, "loss/crossentropy": 2.1546722650527954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22285740077495575, "step": 11360 }, { "epoch": 0.22724, "grad_norm": 2.140625, "grad_norm_var": 0.0143218994140625, "learning_rate": 0.0001, "loss": 4.5415, "loss/crossentropy": 2.341711401939392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23693612217903137, "step": 11362 }, { "epoch": 0.22728, "grad_norm": 2.65625, "grad_norm_var": 0.0461090087890625, "learning_rate": 0.0001, "loss": 4.3131, "loss/crossentropy": 1.8764930367469788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19202134013175964, "step": 11364 }, { "epoch": 0.22732, "grad_norm": 2.0625, "grad_norm_var": 0.0450439453125, "learning_rate": 0.0001, "loss": 4.1172, "loss/crossentropy": 2.035600185394287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2188200280070305, "step": 11366 }, { "epoch": 0.22736, "grad_norm": 2.015625, "grad_norm_var": 0.045481109619140626, "learning_rate": 0.0001, "loss": 4.0469, "loss/crossentropy": 2.120850682258606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22664117813110352, "step": 11368 }, { "epoch": 0.2274, "grad_norm": 2.03125, "grad_norm_var": 0.050455729166666664, "learning_rate": 0.0001, "loss": 3.9552, "loss/crossentropy": 1.9753262996673584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20000861585140228, "step": 11370 }, { "epoch": 0.22744, "grad_norm": 2.703125, "grad_norm_var": 0.06503499348958333, "learning_rate": 0.0001, "loss": 4.2474, "loss/crossentropy": 2.0264564156532288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026129513978958, "step": 11372 }, { "epoch": 0.22748, "grad_norm": 1.890625, "grad_norm_var": 0.07055562337239583, "learning_rate": 0.0001, "loss": 3.8546, "loss/crossentropy": 1.9269734025001526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22436296939849854, "step": 11374 }, { "epoch": 0.22752, "grad_norm": 2.09375, "grad_norm_var": 0.07017822265625, "learning_rate": 0.0001, "loss": 4.4164, "loss/crossentropy": 2.1881991624832153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22483092546463013, "step": 11376 }, { "epoch": 0.22756, "grad_norm": 2.015625, "grad_norm_var": 0.07446187337239583, "learning_rate": 0.0001, "loss": 4.0277, "loss/crossentropy": 2.0142401456832886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21913451701402664, "step": 11378 }, { "epoch": 0.2276, "grad_norm": 2.09375, "grad_norm_var": 0.05676676432291667, "learning_rate": 0.0001, "loss": 4.3374, "loss/crossentropy": 2.0229859352111816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.238722562789917, "step": 11380 }, { "epoch": 0.22764, "grad_norm": 1.9921875, "grad_norm_var": 0.05724054972330729, "learning_rate": 0.0001, "loss": 4.0223, "loss/crossentropy": 2.043630540370941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21258120238780975, "step": 11382 }, { "epoch": 0.22768, "grad_norm": 2.109375, "grad_norm_var": 0.06020685831705729, "learning_rate": 0.0001, "loss": 4.0752, "loss/crossentropy": 2.702946662902832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24362115561962128, "step": 11384 }, { "epoch": 0.22772, "grad_norm": 2.046875, "grad_norm_var": 0.05449600219726562, "learning_rate": 0.0001, "loss": 3.9806, "loss/crossentropy": 2.243411898612976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22256402671337128, "step": 11386 }, { "epoch": 0.22776, "grad_norm": 2.078125, "grad_norm_var": 0.033699289957682295, "learning_rate": 0.0001, "loss": 3.7808, "loss/crossentropy": 2.1451956033706665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21471337974071503, "step": 11388 }, { "epoch": 0.2278, "grad_norm": 2.140625, "grad_norm_var": 0.030775705973307293, "learning_rate": 0.0001, "loss": 4.2865, "loss/crossentropy": 2.426279664039612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22846391052007675, "step": 11390 }, { "epoch": 0.22784, "grad_norm": 2.15625, "grad_norm_var": 0.033455149332682295, "learning_rate": 0.0001, "loss": 4.2878, "loss/crossentropy": 2.0890655517578125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23008010536432266, "step": 11392 }, { "epoch": 0.22788, "grad_norm": 2.046875, "grad_norm_var": 0.030452219645182292, "learning_rate": 0.0001, "loss": 4.2124, "loss/crossentropy": 2.3496296405792236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23718956112861633, "step": 11394 }, { "epoch": 0.22792, "grad_norm": 2.171875, "grad_norm_var": 0.013152821858723959, "learning_rate": 0.0001, "loss": 4.0025, "loss/crossentropy": 2.02141535282135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21102941036224365, "step": 11396 }, { "epoch": 0.22796, "grad_norm": 2.078125, "grad_norm_var": 0.01275634765625, "learning_rate": 0.0001, "loss": 4.4817, "loss/crossentropy": 2.213461995124817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21838735044002533, "step": 11398 }, { "epoch": 0.228, "grad_norm": 2.0625, "grad_norm_var": 0.11743062337239583, "learning_rate": 0.0001, "loss": 4.2313, "loss/crossentropy": 2.161481499671936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24635545909404755, "step": 11400 }, { "epoch": 0.22804, "grad_norm": 2.125, "grad_norm_var": 0.1158843994140625, "learning_rate": 0.0001, "loss": 4.5372, "loss/crossentropy": 2.0998951196670532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2205812931060791, "step": 11402 }, { "epoch": 0.22808, "grad_norm": 2.0625, "grad_norm_var": 0.11588109334309896, "learning_rate": 0.0001, "loss": 4.1606, "loss/crossentropy": 1.8785207867622375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20330313593149185, "step": 11404 }, { "epoch": 0.22812, "grad_norm": 2.03125, "grad_norm_var": 0.11588109334309896, "learning_rate": 0.0001, "loss": 4.2408, "loss/crossentropy": 2.254656672477722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220963753759861, "step": 11406 }, { "epoch": 0.22816, "grad_norm": 2.078125, "grad_norm_var": 0.11553726196289063, "learning_rate": 0.0001, "loss": 4.1356, "loss/crossentropy": 1.7936404347419739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878006011247635, "step": 11408 }, { "epoch": 0.2282, "grad_norm": 2.453125, "grad_norm_var": 0.12290445963541667, "learning_rate": 0.0001, "loss": 4.2199, "loss/crossentropy": 1.8537682890892029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893099844455719, "step": 11410 }, { "epoch": 0.22824, "grad_norm": 3.234375, "grad_norm_var": 0.1904205322265625, "learning_rate": 0.0001, "loss": 4.1628, "loss/crossentropy": 2.0987170338630676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072230949997902, "step": 11412 }, { "epoch": 0.22828, "grad_norm": 2.078125, "grad_norm_var": 0.1907958984375, "learning_rate": 0.0001, "loss": 4.2473, "loss/crossentropy": 2.049329698085785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142597660422325, "step": 11414 }, { "epoch": 0.22832, "grad_norm": 2.03125, "grad_norm_var": 0.09533284505208334, "learning_rate": 0.0001, "loss": 4.283, "loss/crossentropy": 2.066833019256592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22086824476718903, "step": 11416 }, { "epoch": 0.22836, "grad_norm": 2.15625, "grad_norm_var": 0.09519755045572917, "learning_rate": 0.0001, "loss": 4.1837, "loss/crossentropy": 2.01213002204895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23191991448402405, "step": 11418 }, { "epoch": 0.2284, "grad_norm": 5.6875, "grad_norm_var": 0.8556495666503906, "learning_rate": 0.0001, "loss": 4.1977, "loss/crossentropy": 2.1349334716796875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22979671508073807, "step": 11420 }, { "epoch": 0.22844, "grad_norm": 2.15625, "grad_norm_var": 0.8487709045410157, "learning_rate": 0.0001, "loss": 4.2101, "loss/crossentropy": 1.9201850295066833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21903745830059052, "step": 11422 }, { "epoch": 0.22848, "grad_norm": 2.109375, "grad_norm_var": 0.8546376546223958, "learning_rate": 0.0001, "loss": 4.1927, "loss/crossentropy": 2.011150360107422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059009075164795, "step": 11424 }, { "epoch": 0.22852, "grad_norm": 2.171875, "grad_norm_var": 0.8533322652180989, "learning_rate": 0.0001, "loss": 4.3762, "loss/crossentropy": 2.1529648303985596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23028475046157837, "step": 11426 }, { "epoch": 0.22856, "grad_norm": 1.8828125, "grad_norm_var": 0.8176177978515625, "learning_rate": 0.0001, "loss": 3.9657, "loss/crossentropy": 2.1602566838264465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262946516275406, "step": 11428 }, { "epoch": 0.2286, "grad_norm": 1.953125, "grad_norm_var": 0.829766591389974, "learning_rate": 0.0001, "loss": 3.9369, "loss/crossentropy": 1.7230631113052368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19715693593025208, "step": 11430 }, { "epoch": 0.22864, "grad_norm": 1.953125, "grad_norm_var": 0.8287737528483073, "learning_rate": 0.0001, "loss": 4.3791, "loss/crossentropy": 2.4396276473999023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24283458292484283, "step": 11432 }, { "epoch": 0.22868, "grad_norm": 2.171875, "grad_norm_var": 0.8311480204264323, "learning_rate": 0.0001, "loss": 4.2692, "loss/crossentropy": 1.8615645170211792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21288780122995377, "step": 11434 }, { "epoch": 0.22872, "grad_norm": 2.09375, "grad_norm_var": 0.012941233317057292, "learning_rate": 0.0001, "loss": 4.394, "loss/crossentropy": 1.8972707390785217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18510619550943375, "step": 11436 }, { "epoch": 0.22876, "grad_norm": 2.0, "grad_norm_var": 0.011980946858723958, "learning_rate": 0.0001, "loss": 4.1459, "loss/crossentropy": 2.217754364013672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21217356622219086, "step": 11438 }, { "epoch": 0.2288, "grad_norm": 2.09375, "grad_norm_var": 0.014818318684895833, "learning_rate": 0.0001, "loss": 3.7955, "loss/crossentropy": 1.6481398940086365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17804963141679764, "step": 11440 }, { "epoch": 0.22884, "grad_norm": 2.0625, "grad_norm_var": 0.01375732421875, "learning_rate": 0.0001, "loss": 4.3291, "loss/crossentropy": 2.026508390903473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20739784091711044, "step": 11442 }, { "epoch": 0.22888, "grad_norm": 2.09375, "grad_norm_var": 0.012334950764973958, "learning_rate": 0.0001, "loss": 4.2086, "loss/crossentropy": 2.05685293674469, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22639908641576767, "step": 11444 }, { "epoch": 0.22892, "grad_norm": 2.03125, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 4.4689, "loss/crossentropy": 2.073192059993744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21727359294891357, "step": 11446 }, { "epoch": 0.22896, "grad_norm": 2.171875, "grad_norm_var": 0.0092193603515625, "learning_rate": 0.0001, "loss": 4.4312, "loss/crossentropy": 2.1311055421829224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19499187916517258, "step": 11448 }, { "epoch": 0.229, "grad_norm": 2.1875, "grad_norm_var": 0.01021728515625, "learning_rate": 0.0001, "loss": 4.2043, "loss/crossentropy": 2.169051766395569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22266974300146103, "step": 11450 }, { "epoch": 0.22904, "grad_norm": 1.875, "grad_norm_var": 0.013727823893229166, "learning_rate": 0.0001, "loss": 4.1784, "loss/crossentropy": 2.022417426109314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20541484653949738, "step": 11452 }, { "epoch": 0.22908, "grad_norm": 2.140625, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 4.2921, "loss/crossentropy": 2.2652071714401245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21639510244131088, "step": 11454 }, { "epoch": 0.22912, "grad_norm": 2.203125, "grad_norm_var": 0.01343994140625, "learning_rate": 0.0001, "loss": 4.1575, "loss/crossentropy": 1.912036418914795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2204061597585678, "step": 11456 }, { "epoch": 0.22916, "grad_norm": 2.0625, "grad_norm_var": 0.014188639322916667, "learning_rate": 0.0001, "loss": 4.4497, "loss/crossentropy": 2.029780328273773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2032444253563881, "step": 11458 }, { "epoch": 0.2292, "grad_norm": 2.09375, "grad_norm_var": 0.016039021809895835, "learning_rate": 0.0001, "loss": 4.0627, "loss/crossentropy": 1.954946756362915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2028508484363556, "step": 11460 }, { "epoch": 0.22924, "grad_norm": 2.8125, "grad_norm_var": 0.049494425455729164, "learning_rate": 0.0001, "loss": 4.5787, "loss/crossentropy": 2.3707855939865112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460392713546753, "step": 11462 }, { "epoch": 0.22928, "grad_norm": 2.078125, "grad_norm_var": 0.047972615559895834, "learning_rate": 0.0001, "loss": 4.1509, "loss/crossentropy": 2.410157322883606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24578960239887238, "step": 11464 }, { "epoch": 0.22932, "grad_norm": 1.953125, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 4.0935, "loss/crossentropy": 2.1403380036354065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20563968271017075, "step": 11466 }, { "epoch": 0.22936, "grad_norm": 1.953125, "grad_norm_var": 0.0471588134765625, "learning_rate": 0.0001, "loss": 4.0991, "loss/crossentropy": 2.1596190333366394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24405791610479355, "step": 11468 }, { "epoch": 0.2294, "grad_norm": 2.421875, "grad_norm_var": 0.0506744384765625, "learning_rate": 0.0001, "loss": 4.3973, "loss/crossentropy": 1.9427857398986816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22412577271461487, "step": 11470 }, { "epoch": 0.22944, "grad_norm": 1.8984375, "grad_norm_var": 0.05306574503580729, "learning_rate": 0.0001, "loss": 4.1093, "loss/crossentropy": 2.078735053539276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844466239213943, "step": 11472 }, { "epoch": 0.22948, "grad_norm": 2.0, "grad_norm_var": 0.054323069254557294, "learning_rate": 0.0001, "loss": 4.0749, "loss/crossentropy": 2.351656198501587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24430027604103088, "step": 11474 }, { "epoch": 0.22952, "grad_norm": 2.09375, "grad_norm_var": 0.05269139607747396, "learning_rate": 0.0001, "loss": 3.9647, "loss/crossentropy": 2.057854652404785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21224451810121536, "step": 11476 }, { "epoch": 0.22956, "grad_norm": 2.03125, "grad_norm_var": 0.018534088134765626, "learning_rate": 0.0001, "loss": 4.1451, "loss/crossentropy": 2.207979917526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22855235636234283, "step": 11478 }, { "epoch": 0.2296, "grad_norm": 2.125, "grad_norm_var": 0.01870905558268229, "learning_rate": 0.0001, "loss": 4.3748, "loss/crossentropy": 2.08840012550354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21988992393016815, "step": 11480 }, { "epoch": 0.22964, "grad_norm": 2.125, "grad_norm_var": 0.017144521077473957, "learning_rate": 0.0001, "loss": 4.2314, "loss/crossentropy": 2.1339274644851685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278972491621971, "step": 11482 }, { "epoch": 0.22968, "grad_norm": 2.15625, "grad_norm_var": 0.015386708577473958, "learning_rate": 0.0001, "loss": 4.2013, "loss/crossentropy": 1.970679223537445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20286859571933746, "step": 11484 }, { "epoch": 0.22972, "grad_norm": 1.984375, "grad_norm_var": 0.005295562744140625, "learning_rate": 0.0001, "loss": 4.2443, "loss/crossentropy": 2.1023008823394775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21906304359436035, "step": 11486 }, { "epoch": 0.22976, "grad_norm": 1.9453125, "grad_norm_var": 0.004870351155598958, "learning_rate": 0.0001, "loss": 4.2774, "loss/crossentropy": 2.1415608525276184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21857701241970062, "step": 11488 }, { "epoch": 0.2298, "grad_norm": 2.03125, "grad_norm_var": 0.005191802978515625, "learning_rate": 0.0001, "loss": 4.0697, "loss/crossentropy": 1.9523325562477112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19859597831964493, "step": 11490 }, { "epoch": 0.22984, "grad_norm": 2.0, "grad_norm_var": 0.0046770731608072914, "learning_rate": 0.0001, "loss": 4.2433, "loss/crossentropy": 2.1532927751541138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239024043083191, "step": 11492 }, { "epoch": 0.22988, "grad_norm": 2.1875, "grad_norm_var": 0.006705474853515625, "learning_rate": 0.0001, "loss": 4.3556, "loss/crossentropy": 2.160528779029846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23207154870033264, "step": 11494 }, { "epoch": 0.22992, "grad_norm": 2.078125, "grad_norm_var": 0.006528472900390625, "learning_rate": 0.0001, "loss": 4.277, "loss/crossentropy": 2.07854962348938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2189328968524933, "step": 11496 }, { "epoch": 0.22996, "grad_norm": 2.296875, "grad_norm_var": 0.009069569905598958, "learning_rate": 0.0001, "loss": 4.4237, "loss/crossentropy": 2.2270501852035522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23237968981266022, "step": 11498 }, { "epoch": 0.23, "grad_norm": 1.9609375, "grad_norm_var": 0.0125396728515625, "learning_rate": 0.0001, "loss": 3.96, "loss/crossentropy": 1.9641701579093933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063433900475502, "step": 11500 }, { "epoch": 0.23004, "grad_norm": 2.0625, "grad_norm_var": 0.016866048177083332, "learning_rate": 0.0001, "loss": 4.3002, "loss/crossentropy": 1.9243032932281494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20478814095258713, "step": 11502 }, { "epoch": 0.23008, "grad_norm": 2.140625, "grad_norm_var": 0.01587092081705729, "learning_rate": 0.0001, "loss": 4.329, "loss/crossentropy": 2.295292854309082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23184886574745178, "step": 11504 }, { "epoch": 0.23012, "grad_norm": 2.046875, "grad_norm_var": 0.01654052734375, "learning_rate": 0.0001, "loss": 4.0697, "loss/crossentropy": 2.134859561920166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21923956274986267, "step": 11506 }, { "epoch": 0.23016, "grad_norm": 2.109375, "grad_norm_var": 0.0158355712890625, "learning_rate": 0.0001, "loss": 4.2523, "loss/crossentropy": 1.905085265636444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22443564236164093, "step": 11508 }, { "epoch": 0.2302, "grad_norm": 2.03125, "grad_norm_var": 0.014892323811848959, "learning_rate": 0.0001, "loss": 4.0296, "loss/crossentropy": 1.6967324614524841, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19518497586250305, "step": 11510 }, { "epoch": 0.23024, "grad_norm": 1.9609375, "grad_norm_var": 0.015364583333333333, "learning_rate": 0.0001, "loss": 4.225, "loss/crossentropy": 1.638957679271698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19098830223083496, "step": 11512 }, { "epoch": 0.23028, "grad_norm": 2.078125, "grad_norm_var": 0.013732655843098959, "learning_rate": 0.0001, "loss": 4.0479, "loss/crossentropy": 1.8760477900505066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202397421002388, "step": 11514 }, { "epoch": 0.23032, "grad_norm": 2.03125, "grad_norm_var": 0.012189737955729167, "learning_rate": 0.0001, "loss": 4.2265, "loss/crossentropy": 2.201690912246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22841450572013855, "step": 11516 }, { "epoch": 0.23036, "grad_norm": 2.203125, "grad_norm_var": 0.007987467447916667, "learning_rate": 0.0001, "loss": 4.5451, "loss/crossentropy": 2.5022183656692505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25180216133594513, "step": 11518 }, { "epoch": 0.2304, "grad_norm": 2.0, "grad_norm_var": 0.007682291666666666, "learning_rate": 0.0001, "loss": 4.2848, "loss/crossentropy": 2.4634610414505005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23911123722791672, "step": 11520 }, { "epoch": 0.23044, "grad_norm": 2.046875, "grad_norm_var": 0.007503000895182291, "learning_rate": 0.0001, "loss": 4.2919, "loss/crossentropy": 2.1176512241363525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20835164189338684, "step": 11522 }, { "epoch": 0.23048, "grad_norm": 2.0625, "grad_norm_var": 0.007252756754557292, "learning_rate": 0.0001, "loss": 4.0803, "loss/crossentropy": 1.7708171606063843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18085652589797974, "step": 11524 }, { "epoch": 0.23052, "grad_norm": 2.09375, "grad_norm_var": 0.008504231770833334, "learning_rate": 0.0001, "loss": 4.1159, "loss/crossentropy": 1.6740695238113403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17464587092399597, "step": 11526 }, { "epoch": 0.23056, "grad_norm": 1.984375, "grad_norm_var": 0.008135732014973958, "learning_rate": 0.0001, "loss": 4.1829, "loss/crossentropy": 1.8153178691864014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19107046723365784, "step": 11528 }, { "epoch": 0.2306, "grad_norm": 1.9453125, "grad_norm_var": 0.006231435139973958, "learning_rate": 0.0001, "loss": 4.2074, "loss/crossentropy": 2.2833873629570007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19900934398174286, "step": 11530 }, { "epoch": 0.23064, "grad_norm": 2.03125, "grad_norm_var": 0.008593495686848958, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 1.9106029272079468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996043100953102, "step": 11532 }, { "epoch": 0.23068, "grad_norm": 2.109375, "grad_norm_var": 0.006030019124348958, "learning_rate": 0.0001, "loss": 4.3849, "loss/crossentropy": 2.049258530139923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438845992088318, "step": 11534 }, { "epoch": 0.23072, "grad_norm": 1.9765625, "grad_norm_var": 0.006029256184895833, "learning_rate": 0.0001, "loss": 3.9236, "loss/crossentropy": 1.9124351739883423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20111311972141266, "step": 11536 }, { "epoch": 0.23076, "grad_norm": 2.015625, "grad_norm_var": 0.0059397379557291664, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 1.8045400381088257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20846854895353317, "step": 11538 }, { "epoch": 0.2308, "grad_norm": 1.8984375, "grad_norm_var": 0.007297515869140625, "learning_rate": 0.0001, "loss": 4.1804, "loss/crossentropy": 1.9970109462738037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20165999233722687, "step": 11540 }, { "epoch": 0.23084, "grad_norm": 2.09375, "grad_norm_var": 0.013065338134765625, "learning_rate": 0.0001, "loss": 4.1756, "loss/crossentropy": 2.0300097465515137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24561113119125366, "step": 11542 }, { "epoch": 0.23088, "grad_norm": 2.5, "grad_norm_var": 0.025402577718098958, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 2.194391131401062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24075081944465637, "step": 11544 }, { "epoch": 0.23092, "grad_norm": 2.03125, "grad_norm_var": 0.0239898681640625, "learning_rate": 0.0001, "loss": 3.9177, "loss/crossentropy": 2.0229761600494385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21907109022140503, "step": 11546 }, { "epoch": 0.23096, "grad_norm": 2.171875, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 4.3089, "loss/crossentropy": 1.831793487071991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18943443894386292, "step": 11548 }, { "epoch": 0.231, "grad_norm": 1.984375, "grad_norm_var": 0.023395792643229166, "learning_rate": 0.0001, "loss": 4.19, "loss/crossentropy": 2.4099985361099243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22612392157316208, "step": 11550 }, { "epoch": 0.23104, "grad_norm": 2.109375, "grad_norm_var": 0.022946929931640624, "learning_rate": 0.0001, "loss": 4.2512, "loss/crossentropy": 2.0496281385421753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20928414165973663, "step": 11552 }, { "epoch": 0.23108, "grad_norm": 1.875, "grad_norm_var": 0.025986480712890624, "learning_rate": 0.0001, "loss": 4.196, "loss/crossentropy": 1.9366755485534668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19255827367305756, "step": 11554 }, { "epoch": 0.23112, "grad_norm": 1.9296875, "grad_norm_var": 0.02516454060872396, "learning_rate": 0.0001, "loss": 4.3758, "loss/crossentropy": 1.9077317714691162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20340882241725922, "step": 11556 }, { "epoch": 0.23116, "grad_norm": 2.046875, "grad_norm_var": 0.02020848592122396, "learning_rate": 0.0001, "loss": 4.091, "loss/crossentropy": 1.9167283773422241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21566946804523468, "step": 11558 }, { "epoch": 0.2312, "grad_norm": 2.125, "grad_norm_var": 0.0059506734212239586, "learning_rate": 0.0001, "loss": 4.5758, "loss/crossentropy": 2.4604564905166626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22104239463806152, "step": 11560 }, { "epoch": 0.23124, "grad_norm": 1.828125, "grad_norm_var": 0.009124501546223959, "learning_rate": 0.0001, "loss": 4.0398, "loss/crossentropy": 2.359586775302887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064266949892044, "step": 11562 }, { "epoch": 0.23128, "grad_norm": 1.875, "grad_norm_var": 0.009456125895182292, "learning_rate": 0.0001, "loss": 4.0065, "loss/crossentropy": 1.9757064580917358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19224581122398376, "step": 11564 }, { "epoch": 0.23132, "grad_norm": 2.125, "grad_norm_var": 0.03843561808268229, "learning_rate": 0.0001, "loss": 4.5544, "loss/crossentropy": 1.9888432025909424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20799735933542252, "step": 11566 }, { "epoch": 0.23136, "grad_norm": 2.0625, "grad_norm_var": 0.03802057902018229, "learning_rate": 0.0001, "loss": 4.2281, "loss/crossentropy": 2.0829046964645386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22345586121082306, "step": 11568 }, { "epoch": 0.2314, "grad_norm": 2.203125, "grad_norm_var": 0.036649322509765624, "learning_rate": 0.0001, "loss": 4.3421, "loss/crossentropy": 1.964626431465149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21068184822797775, "step": 11570 }, { "epoch": 0.23144, "grad_norm": 2.125, "grad_norm_var": 0.034993489583333336, "learning_rate": 0.0001, "loss": 4.4206, "loss/crossentropy": 2.313928008079529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23291154205799103, "step": 11572 }, { "epoch": 0.23148, "grad_norm": 1.890625, "grad_norm_var": 0.038386027018229164, "learning_rate": 0.0001, "loss": 3.9609, "loss/crossentropy": 2.221343159675598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20674917846918106, "step": 11574 }, { "epoch": 0.23152, "grad_norm": 2.0625, "grad_norm_var": 0.03853759765625, "learning_rate": 0.0001, "loss": 4.4097, "loss/crossentropy": 2.070296823978424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22503886371850967, "step": 11576 }, { "epoch": 0.23156, "grad_norm": 2.09375, "grad_norm_var": 0.03483784993489583, "learning_rate": 0.0001, "loss": 3.9417, "loss/crossentropy": 2.21540367603302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2334480583667755, "step": 11578 }, { "epoch": 0.2316, "grad_norm": 1.8359375, "grad_norm_var": 0.03614679972330729, "learning_rate": 0.0001, "loss": 4.2491, "loss/crossentropy": 1.9438464641571045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20312584936618805, "step": 11580 }, { "epoch": 0.23164, "grad_norm": 1.90625, "grad_norm_var": 0.010994211832682291, "learning_rate": 0.0001, "loss": 4.0622, "loss/crossentropy": 1.4970324039459229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17415452748537064, "step": 11582 }, { "epoch": 0.23168, "grad_norm": 2.515625, "grad_norm_var": 0.026244099934895834, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.2618579864501953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23066900670528412, "step": 11584 }, { "epoch": 0.23172, "grad_norm": 1.9453125, "grad_norm_var": 0.02603123982747396, "learning_rate": 0.0001, "loss": 4.0974, "loss/crossentropy": 2.2342761754989624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20567959547042847, "step": 11586 }, { "epoch": 0.23176, "grad_norm": 2.03125, "grad_norm_var": 0.026151275634765624, "learning_rate": 0.0001, "loss": 4.0417, "loss/crossentropy": 2.144785463809967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19473369419574738, "step": 11588 }, { "epoch": 0.2318, "grad_norm": 2.015625, "grad_norm_var": 0.02851130167643229, "learning_rate": 0.0001, "loss": 4.1996, "loss/crossentropy": 1.9032491445541382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956048086285591, "step": 11590 }, { "epoch": 0.23184, "grad_norm": 2.234375, "grad_norm_var": 0.03227717081705729, "learning_rate": 0.0001, "loss": 4.4689, "loss/crossentropy": 1.934233546257019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22426530718803406, "step": 11592 }, { "epoch": 0.23188, "grad_norm": 2.0, "grad_norm_var": 0.03166071573893229, "learning_rate": 0.0001, "loss": 4.2321, "loss/crossentropy": 2.1792030930519104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22469744831323624, "step": 11594 }, { "epoch": 0.23192, "grad_norm": 2.109375, "grad_norm_var": 0.027705891927083334, "learning_rate": 0.0001, "loss": 4.3779, "loss/crossentropy": 2.3242534399032593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23015478998422623, "step": 11596 }, { "epoch": 0.23196, "grad_norm": 2.5, "grad_norm_var": 0.036622873942057294, "learning_rate": 0.0001, "loss": 4.3553, "loss/crossentropy": 2.2343804836273193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2392263486981392, "step": 11598 }, { "epoch": 0.232, "grad_norm": 2.171875, "grad_norm_var": 0.024559529622395833, "learning_rate": 0.0001, "loss": 4.3019, "loss/crossentropy": 1.9622855186462402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22550886124372482, "step": 11600 }, { "epoch": 0.23204, "grad_norm": 1.9140625, "grad_norm_var": 0.023981730143229168, "learning_rate": 0.0001, "loss": 4.4955, "loss/crossentropy": 2.2274144887924194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21090717613697052, "step": 11602 }, { "epoch": 0.23208, "grad_norm": 2.078125, "grad_norm_var": 0.021683756510416666, "learning_rate": 0.0001, "loss": 4.3011, "loss/crossentropy": 2.092648506164551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21386967599391937, "step": 11604 }, { "epoch": 0.23212, "grad_norm": 1.9609375, "grad_norm_var": 0.020643870035807293, "learning_rate": 0.0001, "loss": 4.2739, "loss/crossentropy": 2.3358936309814453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22385042905807495, "step": 11606 }, { "epoch": 0.23216, "grad_norm": 2.046875, "grad_norm_var": 0.01883112589518229, "learning_rate": 0.0001, "loss": 4.1968, "loss/crossentropy": 1.9800407886505127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614364743232727, "step": 11608 }, { "epoch": 0.2322, "grad_norm": 2.140625, "grad_norm_var": 0.018536122639973958, "learning_rate": 0.0001, "loss": 4.2385, "loss/crossentropy": 1.9646947979927063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21902073919773102, "step": 11610 }, { "epoch": 0.23224, "grad_norm": 2.09375, "grad_norm_var": 0.018930816650390626, "learning_rate": 0.0001, "loss": 4.1564, "loss/crossentropy": 2.0113691687583923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21262314170598984, "step": 11612 }, { "epoch": 0.23228, "grad_norm": 2.328125, "grad_norm_var": 0.0254791259765625, "learning_rate": 0.0001, "loss": 4.3179, "loss/crossentropy": 1.8359833359718323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125518098473549, "step": 11614 }, { "epoch": 0.23232, "grad_norm": 2.15625, "grad_norm_var": 0.02577489217122396, "learning_rate": 0.0001, "loss": 4.0954, "loss/crossentropy": 1.934333622455597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148296758532524, "step": 11616 }, { "epoch": 0.23236, "grad_norm": 2.359375, "grad_norm_var": 0.026192220052083333, "learning_rate": 0.0001, "loss": 4.3556, "loss/crossentropy": 1.9486380815505981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.201569102704525, "step": 11618 }, { "epoch": 0.2324, "grad_norm": 2.140625, "grad_norm_var": 0.0251708984375, "learning_rate": 0.0001, "loss": 4.3761, "loss/crossentropy": 1.9379103183746338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2109408900141716, "step": 11620 }, { "epoch": 0.23244, "grad_norm": 2.203125, "grad_norm_var": 0.02165705362955729, "learning_rate": 0.0001, "loss": 4.3957, "loss/crossentropy": 2.1635884046554565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23330900818109512, "step": 11622 }, { "epoch": 0.23248, "grad_norm": 1.96875, "grad_norm_var": 0.02851130167643229, "learning_rate": 0.0001, "loss": 4.1035, "loss/crossentropy": 2.3183244466781616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21685285866260529, "step": 11624 }, { "epoch": 0.23252, "grad_norm": 2.0, "grad_norm_var": 0.03022028605143229, "learning_rate": 0.0001, "loss": 3.9465, "loss/crossentropy": 2.2932451367378235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22147410362958908, "step": 11626 }, { "epoch": 0.23256, "grad_norm": 1.953125, "grad_norm_var": 0.03144505818684896, "learning_rate": 0.0001, "loss": 4.0598, "loss/crossentropy": 2.1384140253067017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22141631692647934, "step": 11628 }, { "epoch": 0.2326, "grad_norm": 2.0, "grad_norm_var": 0.0146728515625, "learning_rate": 0.0001, "loss": 3.9313, "loss/crossentropy": 1.7338963747024536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18959754705429077, "step": 11630 }, { "epoch": 0.23264, "grad_norm": 2.046875, "grad_norm_var": 0.014943186442057292, "learning_rate": 0.0001, "loss": 4.3293, "loss/crossentropy": 2.186310887336731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19735413044691086, "step": 11632 }, { "epoch": 0.23268, "grad_norm": 1.8671875, "grad_norm_var": 0.01092529296875, "learning_rate": 0.0001, "loss": 3.9636, "loss/crossentropy": 2.0566734075546265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21930715441703796, "step": 11634 }, { "epoch": 0.23272, "grad_norm": 1.984375, "grad_norm_var": 0.011631011962890625, "learning_rate": 0.0001, "loss": 3.8382, "loss/crossentropy": 1.9993655681610107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18999667465686798, "step": 11636 }, { "epoch": 0.23276, "grad_norm": 2.015625, "grad_norm_var": 0.008316802978515624, "learning_rate": 0.0001, "loss": 4.2309, "loss/crossentropy": 2.2788418531417847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21202587336301804, "step": 11638 }, { "epoch": 0.2328, "grad_norm": 1.8828125, "grad_norm_var": 0.00792236328125, "learning_rate": 0.0001, "loss": 3.7837, "loss/crossentropy": 1.7248046398162842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21424376964569092, "step": 11640 }, { "epoch": 0.23284, "grad_norm": 1.8984375, "grad_norm_var": 0.008235422770182292, "learning_rate": 0.0001, "loss": 4.0023, "loss/crossentropy": 1.9053270816802979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19046200066804886, "step": 11642 }, { "epoch": 0.23288, "grad_norm": 1.9765625, "grad_norm_var": 0.0118408203125, "learning_rate": 0.0001, "loss": 4.3561, "loss/crossentropy": 2.3588117361068726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23569310456514359, "step": 11644 }, { "epoch": 0.23292, "grad_norm": 2.078125, "grad_norm_var": 0.012271881103515625, "learning_rate": 0.0001, "loss": 4.4722, "loss/crossentropy": 2.15751576423645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22107885777950287, "step": 11646 }, { "epoch": 0.23296, "grad_norm": 2.03125, "grad_norm_var": 0.009720611572265624, "learning_rate": 0.0001, "loss": 4.4075, "loss/crossentropy": 2.094591200351715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20681703090667725, "step": 11648 }, { "epoch": 0.233, "grad_norm": 2.03125, "grad_norm_var": 0.009523264567057292, "learning_rate": 0.0001, "loss": 3.7832, "loss/crossentropy": 1.7536470890045166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18886109441518784, "step": 11650 }, { "epoch": 0.23304, "grad_norm": 2.0625, "grad_norm_var": 0.0082427978515625, "learning_rate": 0.0001, "loss": 3.9905, "loss/crossentropy": 1.8172362446784973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18359588831663132, "step": 11652 }, { "epoch": 0.23308, "grad_norm": 2.0625, "grad_norm_var": 0.00841064453125, "learning_rate": 0.0001, "loss": 4.2404, "loss/crossentropy": 2.2346678376197815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2154158428311348, "step": 11654 }, { "epoch": 0.23312, "grad_norm": 2.03125, "grad_norm_var": 0.008040110270182291, "learning_rate": 0.0001, "loss": 4.1559, "loss/crossentropy": 2.0573307275772095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160855233669281, "step": 11656 }, { "epoch": 0.23316, "grad_norm": 2.03125, "grad_norm_var": 0.007621256510416666, "learning_rate": 0.0001, "loss": 4.0669, "loss/crossentropy": 2.4358904361724854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22637036442756653, "step": 11658 }, { "epoch": 0.2332, "grad_norm": 1.96875, "grad_norm_var": 0.004644521077473958, "learning_rate": 0.0001, "loss": 3.9938, "loss/crossentropy": 2.387966513633728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2385111078619957, "step": 11660 }, { "epoch": 0.23324, "grad_norm": 2.046875, "grad_norm_var": 0.004709625244140625, "learning_rate": 0.0001, "loss": 4.3371, "loss/crossentropy": 2.30223548412323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149546965956688, "step": 11662 }, { "epoch": 0.23328, "grad_norm": 2.109375, "grad_norm_var": 0.005494944254557292, "learning_rate": 0.0001, "loss": 4.3242, "loss/crossentropy": 2.001839280128479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001148834824562, "step": 11664 }, { "epoch": 0.23332, "grad_norm": 2.25, "grad_norm_var": 0.00693359375, "learning_rate": 0.0001, "loss": 4.4932, "loss/crossentropy": 2.387674927711487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22714952379465103, "step": 11666 }, { "epoch": 0.23336, "grad_norm": 1.9296875, "grad_norm_var": 0.007470448811848958, "learning_rate": 0.0001, "loss": 4.1333, "loss/crossentropy": 2.133235454559326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22263716161251068, "step": 11668 }, { "epoch": 0.2334, "grad_norm": 1.90625, "grad_norm_var": 0.008874257405598959, "learning_rate": 0.0001, "loss": 4.1392, "loss/crossentropy": 2.1041141748428345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20474642515182495, "step": 11670 }, { "epoch": 0.23344, "grad_norm": 2.25, "grad_norm_var": 0.010796864827473959, "learning_rate": 0.0001, "loss": 4.4075, "loss/crossentropy": 2.0425861477851868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23144961893558502, "step": 11672 }, { "epoch": 0.23348, "grad_norm": 1.9921875, "grad_norm_var": 0.009883626302083334, "learning_rate": 0.0001, "loss": 4.0878, "loss/crossentropy": 2.0170212388038635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134404480457306, "step": 11674 }, { "epoch": 0.23352, "grad_norm": 2.015625, "grad_norm_var": 0.010489908854166667, "learning_rate": 0.0001, "loss": 4.2275, "loss/crossentropy": 2.205981433391571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220575213432312, "step": 11676 }, { "epoch": 0.23356, "grad_norm": 1.9765625, "grad_norm_var": 0.011429595947265624, "learning_rate": 0.0001, "loss": 4.1845, "loss/crossentropy": 1.9121403694152832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23312295228242874, "step": 11678 }, { "epoch": 0.2336, "grad_norm": 2.296875, "grad_norm_var": 0.0156890869140625, "learning_rate": 0.0001, "loss": 4.1112, "loss/crossentropy": 1.6793898940086365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19598360359668732, "step": 11680 }, { "epoch": 0.23364, "grad_norm": 2.359375, "grad_norm_var": 0.0222564697265625, "learning_rate": 0.0001, "loss": 4.4814, "loss/crossentropy": 2.1657907962799072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2281472533941269, "step": 11682 }, { "epoch": 0.23368, "grad_norm": 2.015625, "grad_norm_var": 0.022027333577473957, "learning_rate": 0.0001, "loss": 4.0869, "loss/crossentropy": 2.1918715238571167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22837525606155396, "step": 11684 }, { "epoch": 0.23372, "grad_norm": 1.9765625, "grad_norm_var": 0.020699055989583333, "learning_rate": 0.0001, "loss": 4.27, "loss/crossentropy": 2.296495795249939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23491691797971725, "step": 11686 }, { "epoch": 0.23376, "grad_norm": 2.1875, "grad_norm_var": 0.019559733072916665, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 1.9278987646102905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960090771317482, "step": 11688 }, { "epoch": 0.2338, "grad_norm": 2.0625, "grad_norm_var": 0.019461822509765626, "learning_rate": 0.0001, "loss": 4.3393, "loss/crossentropy": 2.0491825938224792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21345915645360947, "step": 11690 }, { "epoch": 0.23384, "grad_norm": 2.015625, "grad_norm_var": 0.02127863566080729, "learning_rate": 0.0001, "loss": 3.9209, "loss/crossentropy": 1.7594041228294373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19004464149475098, "step": 11692 }, { "epoch": 0.23388, "grad_norm": 1.9765625, "grad_norm_var": 0.020961252848307292, "learning_rate": 0.0001, "loss": 4.3645, "loss/crossentropy": 2.0313411951065063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20986144989728928, "step": 11694 }, { "epoch": 0.23392, "grad_norm": 2.109375, "grad_norm_var": 0.016068522135416666, "learning_rate": 0.0001, "loss": 4.4644, "loss/crossentropy": 2.3580493927001953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2253532111644745, "step": 11696 }, { "epoch": 0.23396, "grad_norm": 2.203125, "grad_norm_var": 0.008548990885416666, "learning_rate": 0.0001, "loss": 4.3954, "loss/crossentropy": 2.2036253213882446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22294757515192032, "step": 11698 }, { "epoch": 0.234, "grad_norm": 2.03125, "grad_norm_var": 0.007972971598307291, "learning_rate": 0.0001, "loss": 4.2602, "loss/crossentropy": 1.8575093150138855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19865535199642181, "step": 11700 }, { "epoch": 0.23404, "grad_norm": 1.96875, "grad_norm_var": 0.008571116129557292, "learning_rate": 0.0001, "loss": 4.2619, "loss/crossentropy": 2.2169028520584106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22326484322547913, "step": 11702 }, { "epoch": 0.23408, "grad_norm": 2.015625, "grad_norm_var": 0.007500966389973958, "learning_rate": 0.0001, "loss": 4.4045, "loss/crossentropy": 2.1897542476654053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2097000628709793, "step": 11704 }, { "epoch": 0.23412, "grad_norm": 2.015625, "grad_norm_var": 0.0063168843587239586, "learning_rate": 0.0001, "loss": 4.271, "loss/crossentropy": 2.214607834815979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24112706631422043, "step": 11706 }, { "epoch": 0.23416, "grad_norm": 2.046875, "grad_norm_var": 0.004780832926432292, "learning_rate": 0.0001, "loss": 4.1278, "loss/crossentropy": 2.124355912208557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2222435548901558, "step": 11708 }, { "epoch": 0.2342, "grad_norm": 2.171875, "grad_norm_var": 0.0051025390625, "learning_rate": 0.0001, "loss": 4.3118, "loss/crossentropy": 2.35421621799469, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26120802760124207, "step": 11710 }, { "epoch": 0.23424, "grad_norm": 2.0, "grad_norm_var": 0.00521240234375, "learning_rate": 0.0001, "loss": 4.1781, "loss/crossentropy": 1.9134620428085327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21830002963542938, "step": 11712 }, { "epoch": 0.23428, "grad_norm": 2.078125, "grad_norm_var": 0.003763580322265625, "learning_rate": 0.0001, "loss": 4.2445, "loss/crossentropy": 1.9336887001991272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025599479675293, "step": 11714 }, { "epoch": 0.23432, "grad_norm": 2.015625, "grad_norm_var": 0.004133097330729167, "learning_rate": 0.0001, "loss": 4.0775, "loss/crossentropy": 1.9964489936828613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020488828420639, "step": 11716 }, { "epoch": 0.23436, "grad_norm": 2.078125, "grad_norm_var": 0.00423583984375, "learning_rate": 0.0001, "loss": 4.264, "loss/crossentropy": 2.108368992805481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20985107123851776, "step": 11718 }, { "epoch": 0.2344, "grad_norm": 1.9453125, "grad_norm_var": 0.005008697509765625, "learning_rate": 0.0001, "loss": 3.9458, "loss/crossentropy": 2.0314669013023376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20942936092615128, "step": 11720 }, { "epoch": 0.23444, "grad_norm": 1.9921875, "grad_norm_var": 0.0054107666015625, "learning_rate": 0.0001, "loss": 4.1098, "loss/crossentropy": 2.343130350112915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.237278014421463, "step": 11722 }, { "epoch": 0.23448, "grad_norm": 1.9609375, "grad_norm_var": 0.010658518473307291, "learning_rate": 0.0001, "loss": 4.2497, "loss/crossentropy": 2.1542173624038696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22762110829353333, "step": 11724 }, { "epoch": 0.23452, "grad_norm": 1.9609375, "grad_norm_var": 0.009395090738932292, "learning_rate": 0.0001, "loss": 3.9056, "loss/crossentropy": 1.638447105884552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1707247570157051, "step": 11726 }, { "epoch": 0.23456, "grad_norm": 2.09375, "grad_norm_var": 0.010573069254557291, "learning_rate": 0.0001, "loss": 4.0939, "loss/crossentropy": 2.1509228944778442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091827318072319, "step": 11728 }, { "epoch": 0.2346, "grad_norm": 1.9609375, "grad_norm_var": 0.011677805582682292, "learning_rate": 0.0001, "loss": 4.325, "loss/crossentropy": 2.193585455417633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2697590962052345, "step": 11730 }, { "epoch": 0.23464, "grad_norm": 2.109375, "grad_norm_var": 0.012648264567057291, "learning_rate": 0.0001, "loss": 4.2239, "loss/crossentropy": 2.2058298587799072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21958298981189728, "step": 11732 }, { "epoch": 0.23468, "grad_norm": 2.078125, "grad_norm_var": 0.012894439697265624, "learning_rate": 0.0001, "loss": 4.3269, "loss/crossentropy": 2.0816246271133423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20916947722434998, "step": 11734 }, { "epoch": 0.23472, "grad_norm": 2.03125, "grad_norm_var": 0.0150543212890625, "learning_rate": 0.0001, "loss": 4.1495, "loss/crossentropy": 1.838355541229248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20002590864896774, "step": 11736 }, { "epoch": 0.23476, "grad_norm": 1.9765625, "grad_norm_var": 0.016795857747395834, "learning_rate": 0.0001, "loss": 4.0559, "loss/crossentropy": 2.1602721214294434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21018436551094055, "step": 11738 }, { "epoch": 0.2348, "grad_norm": 2.03125, "grad_norm_var": 0.013181304931640625, "learning_rate": 0.0001, "loss": 4.5376, "loss/crossentropy": 2.6967735290527344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23233920335769653, "step": 11740 }, { "epoch": 0.23484, "grad_norm": 2.03125, "grad_norm_var": 0.012715403238932292, "learning_rate": 0.0001, "loss": 3.9772, "loss/crossentropy": 2.033313810825348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21240226924419403, "step": 11742 }, { "epoch": 0.23488, "grad_norm": 1.96875, "grad_norm_var": 0.012245432535807291, "learning_rate": 0.0001, "loss": 4.21, "loss/crossentropy": 2.123607873916626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.207147017121315, "step": 11744 }, { "epoch": 0.23492, "grad_norm": 1.8125, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 4.1121, "loss/crossentropy": 2.1173813343048096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21387682110071182, "step": 11746 }, { "epoch": 0.23496, "grad_norm": 1.9296875, "grad_norm_var": 0.011456044514973958, "learning_rate": 0.0001, "loss": 4.1219, "loss/crossentropy": 2.27209734916687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2254154533147812, "step": 11748 }, { "epoch": 0.235, "grad_norm": 2.0625, "grad_norm_var": 0.011156209309895833, "learning_rate": 0.0001, "loss": 4.3138, "loss/crossentropy": 1.844546616077423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1740722581744194, "step": 11750 }, { "epoch": 0.23504, "grad_norm": 2.078125, "grad_norm_var": 0.009346516927083333, "learning_rate": 0.0001, "loss": 4.4615, "loss/crossentropy": 2.341967821121216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23638595640659332, "step": 11752 }, { "epoch": 0.23508, "grad_norm": 2.09375, "grad_norm_var": 0.009354400634765624, "learning_rate": 0.0001, "loss": 4.4113, "loss/crossentropy": 2.2561213970184326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23169096559286118, "step": 11754 }, { "epoch": 0.23512, "grad_norm": 1.96875, "grad_norm_var": 0.007972971598307291, "learning_rate": 0.0001, "loss": 4.1603, "loss/crossentropy": 1.8693158030509949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19997497648000717, "step": 11756 }, { "epoch": 0.23516, "grad_norm": 1.953125, "grad_norm_var": 0.0085845947265625, "learning_rate": 0.0001, "loss": 3.9721, "loss/crossentropy": 2.2228434085845947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19779722392559052, "step": 11758 }, { "epoch": 0.2352, "grad_norm": 1.9609375, "grad_norm_var": 0.007899729410807292, "learning_rate": 0.0001, "loss": 4.2572, "loss/crossentropy": 2.177064299583435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21091558039188385, "step": 11760 }, { "epoch": 0.23524, "grad_norm": 1.984375, "grad_norm_var": 0.005020904541015625, "learning_rate": 0.0001, "loss": 4.398, "loss/crossentropy": 2.2667617797851562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2120228409767151, "step": 11762 }, { "epoch": 0.23528, "grad_norm": 2.109375, "grad_norm_var": 0.004713694254557292, "learning_rate": 0.0001, "loss": 4.198, "loss/crossentropy": 2.0310307145118713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194480448961258, "step": 11764 }, { "epoch": 0.23532, "grad_norm": 1.953125, "grad_norm_var": 0.005033111572265625, "learning_rate": 0.0001, "loss": 3.9906, "loss/crossentropy": 1.8481100797653198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20181456953287125, "step": 11766 }, { "epoch": 0.23536, "grad_norm": 2.140625, "grad_norm_var": 0.008642323811848958, "learning_rate": 0.0001, "loss": 4.3486, "loss/crossentropy": 2.254691958427429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26685942709445953, "step": 11768 }, { "epoch": 0.2354, "grad_norm": 1.921875, "grad_norm_var": 0.007993316650390625, "learning_rate": 0.0001, "loss": 4.144, "loss/crossentropy": 2.112728714942932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19724663347005844, "step": 11770 }, { "epoch": 0.23544, "grad_norm": 2.265625, "grad_norm_var": 0.011195627848307292, "learning_rate": 0.0001, "loss": 4.089, "loss/crossentropy": 2.0724143981933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20518633723258972, "step": 11772 }, { "epoch": 0.23548, "grad_norm": 2.015625, "grad_norm_var": 0.010064442952473959, "learning_rate": 0.0001, "loss": 4.1882, "loss/crossentropy": 2.0743810534477234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2181471511721611, "step": 11774 }, { "epoch": 0.23552, "grad_norm": 1.9296875, "grad_norm_var": 0.011740875244140626, "learning_rate": 0.0001, "loss": 4.1121, "loss/crossentropy": 2.02871835231781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22081361711025238, "step": 11776 }, { "epoch": 0.23556, "grad_norm": 2.15625, "grad_norm_var": 0.013185373942057292, "learning_rate": 0.0001, "loss": 4.5607, "loss/crossentropy": 2.1898428201675415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22084421664476395, "step": 11778 }, { "epoch": 0.2356, "grad_norm": 2.0625, "grad_norm_var": 0.012748209635416667, "learning_rate": 0.0001, "loss": 4.2404, "loss/crossentropy": 2.0893847346305847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21616562455892563, "step": 11780 }, { "epoch": 0.23564, "grad_norm": 1.9921875, "grad_norm_var": 0.011769358317057292, "learning_rate": 0.0001, "loss": 4.215, "loss/crossentropy": 1.8590916991233826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18936381489038467, "step": 11782 }, { "epoch": 0.23568, "grad_norm": 2.296875, "grad_norm_var": 0.012741851806640624, "learning_rate": 0.0001, "loss": 3.9544, "loss/crossentropy": 2.105339765548706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22275932878255844, "step": 11784 }, { "epoch": 0.23572, "grad_norm": 2.140625, "grad_norm_var": 0.011987050374348959, "learning_rate": 0.0001, "loss": 4.2585, "loss/crossentropy": 2.1720080375671387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20734921097755432, "step": 11786 }, { "epoch": 0.23576, "grad_norm": 1.984375, "grad_norm_var": 0.012640126546223958, "learning_rate": 0.0001, "loss": 4.2715, "loss/crossentropy": 2.2301958799362183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21865415573120117, "step": 11788 }, { "epoch": 0.2358, "grad_norm": 2.1875, "grad_norm_var": 0.013626861572265624, "learning_rate": 0.0001, "loss": 3.7518, "loss/crossentropy": 1.588155210018158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19334131479263306, "step": 11790 }, { "epoch": 0.23584, "grad_norm": 2.046875, "grad_norm_var": 0.025248209635416668, "learning_rate": 0.0001, "loss": 4.4052, "loss/crossentropy": 2.208239734172821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21055688709020615, "step": 11792 }, { "epoch": 0.23588, "grad_norm": 1.9140625, "grad_norm_var": 0.030368804931640625, "learning_rate": 0.0001, "loss": 4.1996, "loss/crossentropy": 2.46909761428833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.225817009806633, "step": 11794 }, { "epoch": 0.23592, "grad_norm": 1.984375, "grad_norm_var": 0.03367691040039063, "learning_rate": 0.0001, "loss": 4.2792, "loss/crossentropy": 2.022938549518585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21232923865318298, "step": 11796 }, { "epoch": 0.23596, "grad_norm": 1.9765625, "grad_norm_var": 0.03394953409830729, "learning_rate": 0.0001, "loss": 4.3949, "loss/crossentropy": 2.333058714866638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23842789232730865, "step": 11798 }, { "epoch": 0.236, "grad_norm": 2.015625, "grad_norm_var": 0.0366363525390625, "learning_rate": 0.0001, "loss": 3.8698, "loss/crossentropy": 1.9570570588111877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20660093426704407, "step": 11800 }, { "epoch": 0.23604, "grad_norm": 2.0, "grad_norm_var": 0.0335601806640625, "learning_rate": 0.0001, "loss": 4.3649, "loss/crossentropy": 2.2074697017669678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23766764998435974, "step": 11802 }, { "epoch": 0.23608, "grad_norm": 1.9921875, "grad_norm_var": 0.03026301066080729, "learning_rate": 0.0001, "loss": 4.4127, "loss/crossentropy": 2.327863335609436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22967173904180527, "step": 11804 }, { "epoch": 0.23612, "grad_norm": 2.03125, "grad_norm_var": 0.02802734375, "learning_rate": 0.0001, "loss": 4.3367, "loss/crossentropy": 2.135041356086731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21708428114652634, "step": 11806 }, { "epoch": 0.23616, "grad_norm": 2.078125, "grad_norm_var": 0.00665283203125, "learning_rate": 0.0001, "loss": 3.9805, "loss/crossentropy": 1.9111011624336243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20122328400611877, "step": 11808 }, { "epoch": 0.2362, "grad_norm": 1.984375, "grad_norm_var": 0.006436920166015625, "learning_rate": 0.0001, "loss": 4.19, "loss/crossentropy": 2.0823878049850464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23593349009752274, "step": 11810 }, { "epoch": 0.23624, "grad_norm": 2.078125, "grad_norm_var": 0.0069000244140625, "learning_rate": 0.0001, "loss": 3.9554, "loss/crossentropy": 2.298948645591736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22076356410980225, "step": 11812 }, { "epoch": 0.23628, "grad_norm": 1.96875, "grad_norm_var": 0.006483713785807292, "learning_rate": 0.0001, "loss": 4.1119, "loss/crossentropy": 2.383033037185669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20850034803152084, "step": 11814 }, { "epoch": 0.23632, "grad_norm": 2.109375, "grad_norm_var": 0.10857645670572917, "learning_rate": 0.0001, "loss": 4.3648, "loss/crossentropy": 2.004193425178528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27853623032569885, "step": 11816 }, { "epoch": 0.23636, "grad_norm": 2.515625, "grad_norm_var": 0.11877339680989583, "learning_rate": 0.0001, "loss": 4.4454, "loss/crossentropy": 1.9110660552978516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773407608270645, "step": 11818 }, { "epoch": 0.2364, "grad_norm": 2.140625, "grad_norm_var": 0.11894505818684896, "learning_rate": 0.0001, "loss": 4.5428, "loss/crossentropy": 2.300672471523285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23710736632347107, "step": 11820 }, { "epoch": 0.23644, "grad_norm": 1.984375, "grad_norm_var": 0.11943333943684896, "learning_rate": 0.0001, "loss": 4.1413, "loss/crossentropy": 2.270769238471985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2182588130235672, "step": 11822 }, { "epoch": 0.23648, "grad_norm": 2.078125, "grad_norm_var": 0.11885960896809895, "learning_rate": 0.0001, "loss": 4.2744, "loss/crossentropy": 2.085016667842865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19494660943746567, "step": 11824 }, { "epoch": 0.23652, "grad_norm": 2.078125, "grad_norm_var": 0.11644261678059896, "learning_rate": 0.0001, "loss": 4.5554, "loss/crossentropy": 2.384607672691345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22075054794549942, "step": 11826 }, { "epoch": 0.23656, "grad_norm": 1.8984375, "grad_norm_var": 0.11926167805989583, "learning_rate": 0.0001, "loss": 3.9076, "loss/crossentropy": 2.1217936277389526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126227766275406, "step": 11828 }, { "epoch": 0.2366, "grad_norm": 1.875, "grad_norm_var": 0.12241185506184896, "learning_rate": 0.0001, "loss": 3.8285, "loss/crossentropy": 2.1690168380737305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20450890809297562, "step": 11830 }, { "epoch": 0.23664, "grad_norm": 1.9765625, "grad_norm_var": 0.031998697916666666, "learning_rate": 0.0001, "loss": 4.2817, "loss/crossentropy": 2.2614429593086243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22817185521125793, "step": 11832 }, { "epoch": 0.23668, "grad_norm": 1.9296875, "grad_norm_var": 0.023545074462890624, "learning_rate": 0.0001, "loss": 4.3074, "loss/crossentropy": 2.177332043647766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.226592555642128, "step": 11834 }, { "epoch": 0.23672, "grad_norm": 1.859375, "grad_norm_var": 0.01765925089518229, "learning_rate": 0.0001, "loss": 4.0471, "loss/crossentropy": 1.947661578655243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1762884557247162, "step": 11836 }, { "epoch": 0.23676, "grad_norm": 2.015625, "grad_norm_var": 0.018184153238932292, "learning_rate": 0.0001, "loss": 3.9014, "loss/crossentropy": 1.7112661004066467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18696465343236923, "step": 11838 }, { "epoch": 0.2368, "grad_norm": 2.078125, "grad_norm_var": 0.017439524332682293, "learning_rate": 0.0001, "loss": 4.1761, "loss/crossentropy": 2.0478790402412415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21436551213264465, "step": 11840 }, { "epoch": 0.23684, "grad_norm": 2.078125, "grad_norm_var": 0.01718724568684896, "learning_rate": 0.0001, "loss": 4.1702, "loss/crossentropy": 2.0541738867759705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23121927678585052, "step": 11842 }, { "epoch": 0.23688, "grad_norm": 2.109375, "grad_norm_var": 0.01876805623372396, "learning_rate": 0.0001, "loss": 4.208, "loss/crossentropy": 2.04353004693985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22237974405288696, "step": 11844 }, { "epoch": 0.23692, "grad_norm": 2.1875, "grad_norm_var": 0.017488606770833335, "learning_rate": 0.0001, "loss": 4.4641, "loss/crossentropy": 2.110148549079895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23324476927518845, "step": 11846 }, { "epoch": 0.23696, "grad_norm": 1.9140625, "grad_norm_var": 0.018623860677083333, "learning_rate": 0.0001, "loss": 4.0831, "loss/crossentropy": 2.207589864730835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209608756005764, "step": 11848 }, { "epoch": 0.237, "grad_norm": 1.9140625, "grad_norm_var": 0.012180328369140625, "learning_rate": 0.0001, "loss": 4.1896, "loss/crossentropy": 2.2447429895401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016080766916275, "step": 11850 }, { "epoch": 0.23704, "grad_norm": 2.0625, "grad_norm_var": 0.009771474202473958, "learning_rate": 0.0001, "loss": 4.3133, "loss/crossentropy": 2.3349474668502808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22105249762535095, "step": 11852 }, { "epoch": 0.23708, "grad_norm": 1.9765625, "grad_norm_var": 0.008024088541666667, "learning_rate": 0.0001, "loss": 4.1091, "loss/crossentropy": 1.8444748520851135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993526816368103, "step": 11854 }, { "epoch": 0.23712, "grad_norm": 2.109375, "grad_norm_var": 0.009323883056640624, "learning_rate": 0.0001, "loss": 4.0758, "loss/crossentropy": 1.9181615710258484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18698236346244812, "step": 11856 }, { "epoch": 0.23716, "grad_norm": 2.15625, "grad_norm_var": 0.009627024332682291, "learning_rate": 0.0001, "loss": 4.1454, "loss/crossentropy": 2.1930960416793823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122444212436676, "step": 11858 }, { "epoch": 0.2372, "grad_norm": 2.171875, "grad_norm_var": 0.009439849853515625, "learning_rate": 0.0001, "loss": 4.3177, "loss/crossentropy": 1.7835432887077332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20900961011648178, "step": 11860 }, { "epoch": 0.23724, "grad_norm": 2.15625, "grad_norm_var": 0.011775461832682292, "learning_rate": 0.0001, "loss": 4.6362, "loss/crossentropy": 2.1839439868927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21574077755212784, "step": 11862 }, { "epoch": 0.23728, "grad_norm": 1.96875, "grad_norm_var": 0.010791015625, "learning_rate": 0.0001, "loss": 3.8916, "loss/crossentropy": 1.9817007184028625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773921698331833, "step": 11864 }, { "epoch": 0.23732, "grad_norm": 1.9921875, "grad_norm_var": 0.008923085530598958, "learning_rate": 0.0001, "loss": 4.2065, "loss/crossentropy": 2.017501652240753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2247873619198799, "step": 11866 }, { "epoch": 0.23736, "grad_norm": 1.9140625, "grad_norm_var": 0.010992177327473958, "learning_rate": 0.0001, "loss": 3.8813, "loss/crossentropy": 2.0779114961624146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21136770397424698, "step": 11868 }, { "epoch": 0.2374, "grad_norm": 2.109375, "grad_norm_var": 0.010497029622395833, "learning_rate": 0.0001, "loss": 4.341, "loss/crossentropy": 2.3987231254577637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2326417714357376, "step": 11870 }, { "epoch": 0.23744, "grad_norm": 2.0625, "grad_norm_var": 0.008975982666015625, "learning_rate": 0.0001, "loss": 4.0466, "loss/crossentropy": 1.7145346999168396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19222228974103928, "step": 11872 }, { "epoch": 0.23748, "grad_norm": 2.015625, "grad_norm_var": 0.008829498291015625, "learning_rate": 0.0001, "loss": 4.2683, "loss/crossentropy": 2.378560423851013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21374844759702682, "step": 11874 }, { "epoch": 0.23752, "grad_norm": 2.0625, "grad_norm_var": 0.0073626200358072914, "learning_rate": 0.0001, "loss": 4.2548, "loss/crossentropy": 1.951455295085907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2217455804347992, "step": 11876 }, { "epoch": 0.23756, "grad_norm": 1.8671875, "grad_norm_var": 0.00513916015625, "learning_rate": 0.0001, "loss": 3.8867, "loss/crossentropy": 1.9309074878692627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100483626127243, "step": 11878 }, { "epoch": 0.2376, "grad_norm": 2.015625, "grad_norm_var": 0.00445556640625, "learning_rate": 0.0001, "loss": 4.102, "loss/crossentropy": 2.2295292615890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2196313813328743, "step": 11880 }, { "epoch": 0.23764, "grad_norm": 1.9453125, "grad_norm_var": 0.004964192708333333, "learning_rate": 0.0001, "loss": 4.0514, "loss/crossentropy": 2.0665449500083923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20752248913049698, "step": 11882 }, { "epoch": 0.23768, "grad_norm": 2.1875, "grad_norm_var": 0.00560302734375, "learning_rate": 0.0001, "loss": 4.4022, "loss/crossentropy": 1.9645958542823792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20391641557216644, "step": 11884 }, { "epoch": 0.23772, "grad_norm": 1.984375, "grad_norm_var": 0.005956013997395833, "learning_rate": 0.0001, "loss": 4.1698, "loss/crossentropy": 2.1280174255371094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21947231888771057, "step": 11886 }, { "epoch": 0.23776, "grad_norm": 2.03125, "grad_norm_var": 0.0073964436848958336, "learning_rate": 0.0001, "loss": 4.2043, "loss/crossentropy": 1.9462851285934448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1961178332567215, "step": 11888 }, { "epoch": 0.2378, "grad_norm": 2.203125, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 4.4846, "loss/crossentropy": 2.249086618423462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22473402321338654, "step": 11890 }, { "epoch": 0.23784, "grad_norm": 1.9375, "grad_norm_var": 0.010472615559895834, "learning_rate": 0.0001, "loss": 4.1368, "loss/crossentropy": 2.1972378492355347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2164871245622635, "step": 11892 }, { "epoch": 0.23788, "grad_norm": 2.15625, "grad_norm_var": 0.009409332275390625, "learning_rate": 0.0001, "loss": 4.228, "loss/crossentropy": 2.0067209601402283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20195162296295166, "step": 11894 }, { "epoch": 0.23792, "grad_norm": 2.03125, "grad_norm_var": 0.009673817952473959, "learning_rate": 0.0001, "loss": 4.1767, "loss/crossentropy": 1.9103696942329407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19616805016994476, "step": 11896 }, { "epoch": 0.23796, "grad_norm": 1.984375, "grad_norm_var": 0.011595662434895833, "learning_rate": 0.0001, "loss": 4.228, "loss/crossentropy": 2.1524049639701843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21107815951108932, "step": 11898 }, { "epoch": 0.238, "grad_norm": 2.0, "grad_norm_var": 0.0110260009765625, "learning_rate": 0.0001, "loss": 4.183, "loss/crossentropy": 2.2112287878990173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2099093720316887, "step": 11900 }, { "epoch": 0.23804, "grad_norm": 2.390625, "grad_norm_var": 0.8914347330729167, "learning_rate": 0.0001, "loss": 4.647, "loss/crossentropy": 2.2945204973220825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2713918164372444, "step": 11902 }, { "epoch": 0.23808, "grad_norm": 2.078125, "grad_norm_var": 0.8781575520833333, "learning_rate": 0.0001, "loss": 4.4699, "loss/crossentropy": 2.1838968992233276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22640438377857208, "step": 11904 }, { "epoch": 0.23812, "grad_norm": 2.0, "grad_norm_var": 0.89010009765625, "learning_rate": 0.0001, "loss": 4.0843, "loss/crossentropy": 1.9089699983596802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19553899765014648, "step": 11906 }, { "epoch": 0.23816, "grad_norm": 2.03125, "grad_norm_var": 0.8873443603515625, "learning_rate": 0.0001, "loss": 4.2573, "loss/crossentropy": 2.105385661125183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21528839319944382, "step": 11908 }, { "epoch": 0.2382, "grad_norm": 2.015625, "grad_norm_var": 0.8921946207682292, "learning_rate": 0.0001, "loss": 4.0381, "loss/crossentropy": 1.565223515033722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17765602469444275, "step": 11910 }, { "epoch": 0.23824, "grad_norm": 2.078125, "grad_norm_var": 0.8919016520182291, "learning_rate": 0.0001, "loss": 4.3014, "loss/crossentropy": 2.220748543739319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21904101967811584, "step": 11912 }, { "epoch": 0.23828, "grad_norm": 2.15625, "grad_norm_var": 0.889306640625, "learning_rate": 0.0001, "loss": 4.2965, "loss/crossentropy": 2.294031500816345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21700909733772278, "step": 11914 }, { "epoch": 0.23832, "grad_norm": 2.171875, "grad_norm_var": 0.8795237223307292, "learning_rate": 0.0001, "loss": 4.3439, "loss/crossentropy": 2.072917103767395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20462248474359512, "step": 11916 }, { "epoch": 0.23836, "grad_norm": 2.03125, "grad_norm_var": 0.00533447265625, "learning_rate": 0.0001, "loss": 4.1843, "loss/crossentropy": 2.0541720390319824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127074897289276, "step": 11918 }, { "epoch": 0.2384, "grad_norm": 2.109375, "grad_norm_var": 0.0046539306640625, "learning_rate": 0.0001, "loss": 3.8922, "loss/crossentropy": 1.9873629808425903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19215020537376404, "step": 11920 }, { "epoch": 0.23844, "grad_norm": 2.09375, "grad_norm_var": 0.0032135009765625, "learning_rate": 0.0001, "loss": 4.1943, "loss/crossentropy": 2.447067141532898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2318919375538826, "step": 11922 }, { "epoch": 0.23848, "grad_norm": 2.0, "grad_norm_var": 0.0038157145182291666, "learning_rate": 0.0001, "loss": 3.8756, "loss/crossentropy": 1.9918025732040405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20865648984909058, "step": 11924 }, { "epoch": 0.23852, "grad_norm": 2.046875, "grad_norm_var": 0.0026041666666666665, "learning_rate": 0.0001, "loss": 4.3243, "loss/crossentropy": 2.0803070068359375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21753622591495514, "step": 11926 }, { "epoch": 0.23856, "grad_norm": 1.9921875, "grad_norm_var": 0.002976226806640625, "learning_rate": 0.0001, "loss": 4.1828, "loss/crossentropy": 2.118411421775818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21741919964551926, "step": 11928 }, { "epoch": 0.2386, "grad_norm": 1.9375, "grad_norm_var": 0.0034075419108072916, "learning_rate": 0.0001, "loss": 4.0056, "loss/crossentropy": 1.8924900889396667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20609601587057114, "step": 11930 }, { "epoch": 0.23864, "grad_norm": 2.046875, "grad_norm_var": 0.002418772379557292, "learning_rate": 0.0001, "loss": 4.0012, "loss/crossentropy": 1.743731439113617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19873473048210144, "step": 11932 }, { "epoch": 0.23868, "grad_norm": 2.015625, "grad_norm_var": 0.002929433186848958, "learning_rate": 0.0001, "loss": 4.2564, "loss/crossentropy": 2.291381061077118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21248165518045425, "step": 11934 }, { "epoch": 0.23872, "grad_norm": 2.109375, "grad_norm_var": 0.007452138264973958, "learning_rate": 0.0001, "loss": 4.3938, "loss/crossentropy": 1.7672501802444458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19201595336198807, "step": 11936 }, { "epoch": 0.23876, "grad_norm": 1.9453125, "grad_norm_var": 0.008162434895833333, "learning_rate": 0.0001, "loss": 3.8693, "loss/crossentropy": 1.8719280362129211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960783526301384, "step": 11938 }, { "epoch": 0.2388, "grad_norm": 2.28125, "grad_norm_var": 0.010978190104166667, "learning_rate": 0.0001, "loss": 4.3345, "loss/crossentropy": 1.8103876113891602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1964937001466751, "step": 11940 }, { "epoch": 0.23884, "grad_norm": 2.03125, "grad_norm_var": 0.011901601155598959, "learning_rate": 0.0001, "loss": 4.2134, "loss/crossentropy": 1.8743855953216553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19404225796461105, "step": 11942 }, { "epoch": 0.23888, "grad_norm": 1.9453125, "grad_norm_var": 0.012562815348307292, "learning_rate": 0.0001, "loss": 4.2379, "loss/crossentropy": 2.3096803426742554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23256508260965347, "step": 11944 }, { "epoch": 0.23892, "grad_norm": 2.109375, "grad_norm_var": 0.012123362223307291, "learning_rate": 0.0001, "loss": 4.5089, "loss/crossentropy": 2.207027554512024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20778407156467438, "step": 11946 }, { "epoch": 0.23896, "grad_norm": 2.015625, "grad_norm_var": 0.011671702067057291, "learning_rate": 0.0001, "loss": 4.0423, "loss/crossentropy": 1.845999002456665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19381015002727509, "step": 11948 }, { "epoch": 0.239, "grad_norm": 2.03125, "grad_norm_var": 0.011628977457682292, "learning_rate": 0.0001, "loss": 4.1054, "loss/crossentropy": 1.677983045578003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17493421584367752, "step": 11950 }, { "epoch": 0.23904, "grad_norm": 2.140625, "grad_norm_var": 0.008107248942057292, "learning_rate": 0.0001, "loss": 4.1729, "loss/crossentropy": 2.1798466444015503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22395183145999908, "step": 11952 }, { "epoch": 0.23908, "grad_norm": 1.9140625, "grad_norm_var": 0.013525390625, "learning_rate": 0.0001, "loss": 4.0162, "loss/crossentropy": 2.008498191833496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21069375425577164, "step": 11954 }, { "epoch": 0.23912, "grad_norm": 2.140625, "grad_norm_var": 0.014546712239583334, "learning_rate": 0.0001, "loss": 4.3736, "loss/crossentropy": 2.1712071895599365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21842175722122192, "step": 11956 }, { "epoch": 0.23916, "grad_norm": 2.578125, "grad_norm_var": 0.03242365519205729, "learning_rate": 0.0001, "loss": 4.1338, "loss/crossentropy": 2.2769562005996704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22064895182847977, "step": 11958 }, { "epoch": 0.2392, "grad_norm": 2.09375, "grad_norm_var": 0.032572428385416664, "learning_rate": 0.0001, "loss": 4.3401, "loss/crossentropy": 2.127632260322571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139139398932457, "step": 11960 }, { "epoch": 0.23924, "grad_norm": 2.046875, "grad_norm_var": 0.03223368326822917, "learning_rate": 0.0001, "loss": 4.3547, "loss/crossentropy": 2.2687970399856567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23024404793977737, "step": 11962 }, { "epoch": 0.23928, "grad_norm": 2.0625, "grad_norm_var": 0.032136027018229166, "learning_rate": 0.0001, "loss": 4.3532, "loss/crossentropy": 2.2271196246147156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22528529912233353, "step": 11964 }, { "epoch": 0.23932, "grad_norm": 2.03125, "grad_norm_var": 0.032136027018229166, "learning_rate": 0.0001, "loss": 4.1898, "loss/crossentropy": 2.3209941387176514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21150042116641998, "step": 11966 }, { "epoch": 0.23936, "grad_norm": 2.046875, "grad_norm_var": 0.03208719889322917, "learning_rate": 0.0001, "loss": 4.197, "loss/crossentropy": 1.9936136603355408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22582116723060608, "step": 11968 }, { "epoch": 0.2394, "grad_norm": 2.03125, "grad_norm_var": 0.024055989583333333, "learning_rate": 0.0001, "loss": 3.9045, "loss/crossentropy": 1.7509311437606812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19028881192207336, "step": 11970 }, { "epoch": 0.23944, "grad_norm": 2.140625, "grad_norm_var": 0.02213134765625, "learning_rate": 0.0001, "loss": 4.2365, "loss/crossentropy": 2.2120620012283325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23357221484184265, "step": 11972 }, { "epoch": 0.23948, "grad_norm": 2.109375, "grad_norm_var": 0.005686187744140625, "learning_rate": 0.0001, "loss": 4.0129, "loss/crossentropy": 1.794329285621643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17914240062236786, "step": 11974 }, { "epoch": 0.23952, "grad_norm": 2.03125, "grad_norm_var": 0.010550689697265626, "learning_rate": 0.0001, "loss": 4.1347, "loss/crossentropy": 2.1647136211395264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21009670197963715, "step": 11976 }, { "epoch": 0.23956, "grad_norm": 2.046875, "grad_norm_var": 0.010660552978515625, "learning_rate": 0.0001, "loss": 4.2365, "loss/crossentropy": 1.674091637134552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19617585837841034, "step": 11978 }, { "epoch": 0.2396, "grad_norm": 2.015625, "grad_norm_var": 0.011739095052083334, "learning_rate": 0.0001, "loss": 4.2474, "loss/crossentropy": 1.9591755867004395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18309858441352844, "step": 11980 }, { "epoch": 0.23964, "grad_norm": 2.34375, "grad_norm_var": 0.017862955729166668, "learning_rate": 0.0001, "loss": 4.3642, "loss/crossentropy": 2.0550093054771423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22483228147029877, "step": 11982 }, { "epoch": 0.23968, "grad_norm": 2.203125, "grad_norm_var": 0.019291178385416666, "learning_rate": 0.0001, "loss": 4.3937, "loss/crossentropy": 2.264032781124115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101321816444397, "step": 11984 }, { "epoch": 0.23972, "grad_norm": 2.015625, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 4.4341, "loss/crossentropy": 2.06082820892334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090815082192421, "step": 11986 }, { "epoch": 0.23976, "grad_norm": 2.0, "grad_norm_var": 0.0197662353515625, "learning_rate": 0.0001, "loss": 4.4159, "loss/crossentropy": 2.248009443283081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22642739117145538, "step": 11988 }, { "epoch": 0.2398, "grad_norm": 1.9140625, "grad_norm_var": 0.02211278279622396, "learning_rate": 0.0001, "loss": 3.9601, "loss/crossentropy": 1.797426462173462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18600185215473175, "step": 11990 }, { "epoch": 0.23984, "grad_norm": 2.015625, "grad_norm_var": 0.01573460896809896, "learning_rate": 0.0001, "loss": 4.1666, "loss/crossentropy": 2.5529314279556274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23957456648349762, "step": 11992 }, { "epoch": 0.23988, "grad_norm": 2.125, "grad_norm_var": 0.015933990478515625, "learning_rate": 0.0001, "loss": 4.2971, "loss/crossentropy": 1.9974916577339172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23130090534687042, "step": 11994 }, { "epoch": 0.23992, "grad_norm": 2.046875, "grad_norm_var": 0.014679972330729167, "learning_rate": 0.0001, "loss": 4.0358, "loss/crossentropy": 1.9714577794075012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969093456864357, "step": 11996 }, { "epoch": 0.23996, "grad_norm": 2.25, "grad_norm_var": 0.011055501302083333, "learning_rate": 0.0001, "loss": 4.282, "loss/crossentropy": 1.8047285079956055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19191773235797882, "step": 11998 }, { "epoch": 0.24, "grad_norm": 2.09375, "grad_norm_var": 0.009187825520833333, "learning_rate": 0.0001, "loss": 4.4078, "loss/crossentropy": 1.9240365028381348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19021467864513397, "step": 12000 }, { "epoch": 0.24004, "grad_norm": 2.125, "grad_norm_var": 0.009041086832682291, "learning_rate": 0.0001, "loss": 4.1657, "loss/crossentropy": 1.6503748297691345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18838192522525787, "step": 12002 }, { "epoch": 0.24008, "grad_norm": 2.109375, "grad_norm_var": 0.009368642171223959, "learning_rate": 0.0001, "loss": 4.3451, "loss/crossentropy": 2.238506555557251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139800265431404, "step": 12004 }, { "epoch": 0.24012, "grad_norm": 1.890625, "grad_norm_var": 0.008695220947265625, "learning_rate": 0.0001, "loss": 4.0978, "loss/crossentropy": 1.8623422384262085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19039735198020935, "step": 12006 }, { "epoch": 0.24016, "grad_norm": 2.109375, "grad_norm_var": 0.0096588134765625, "learning_rate": 0.0001, "loss": 3.9741, "loss/crossentropy": 1.975899577140808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20808710902929306, "step": 12008 }, { "epoch": 0.2402, "grad_norm": 2.15625, "grad_norm_var": 0.0108642578125, "learning_rate": 0.0001, "loss": 4.4332, "loss/crossentropy": 2.2959831953048706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23725779354572296, "step": 12010 }, { "epoch": 0.24024, "grad_norm": 1.8515625, "grad_norm_var": 0.015130360921223959, "learning_rate": 0.0001, "loss": 3.9117, "loss/crossentropy": 2.1626380681991577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068287953734398, "step": 12012 }, { "epoch": 0.24028, "grad_norm": 2.15625, "grad_norm_var": 0.013474273681640624, "learning_rate": 0.0001, "loss": 4.1614, "loss/crossentropy": 1.925924837589264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2097126841545105, "step": 12014 }, { "epoch": 0.24032, "grad_norm": 2.046875, "grad_norm_var": 0.018302154541015626, "learning_rate": 0.0001, "loss": 4.5545, "loss/crossentropy": 2.147459626197815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2345646619796753, "step": 12016 }, { "epoch": 0.24036, "grad_norm": 1.8359375, "grad_norm_var": 0.021418253580729168, "learning_rate": 0.0001, "loss": 4.0219, "loss/crossentropy": 2.052124857902527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058626338839531, "step": 12018 }, { "epoch": 0.2404, "grad_norm": 1.8828125, "grad_norm_var": 0.021329752604166665, "learning_rate": 0.0001, "loss": 3.8619, "loss/crossentropy": 2.377007842063904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124926745891571, "step": 12020 }, { "epoch": 0.24044, "grad_norm": 2.0, "grad_norm_var": 0.020173136393229166, "learning_rate": 0.0001, "loss": 3.9542, "loss/crossentropy": 1.9117819666862488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976870447397232, "step": 12022 }, { "epoch": 0.24048, "grad_norm": 2.09375, "grad_norm_var": 0.021478017171223957, "learning_rate": 0.0001, "loss": 4.5771, "loss/crossentropy": 2.439339756965637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22178436815738678, "step": 12024 }, { "epoch": 0.24052, "grad_norm": 2.140625, "grad_norm_var": 0.02029596964518229, "learning_rate": 0.0001, "loss": 4.1182, "loss/crossentropy": 1.6923209428787231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19288206100463867, "step": 12026 }, { "epoch": 0.24056, "grad_norm": 1.9921875, "grad_norm_var": 0.016434478759765624, "learning_rate": 0.0001, "loss": 4.1733, "loss/crossentropy": 2.071919083595276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21092405915260315, "step": 12028 }, { "epoch": 0.2406, "grad_norm": 1.984375, "grad_norm_var": 0.016993967692057292, "learning_rate": 0.0001, "loss": 3.9688, "loss/crossentropy": 1.994953691959381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19953829050064087, "step": 12030 }, { "epoch": 0.24064, "grad_norm": 2.109375, "grad_norm_var": 0.011201731363932292, "learning_rate": 0.0001, "loss": 4.3252, "loss/crossentropy": 2.0545560121536255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20861412584781647, "step": 12032 }, { "epoch": 0.24068, "grad_norm": 1.8828125, "grad_norm_var": 0.0103424072265625, "learning_rate": 0.0001, "loss": 4.0859, "loss/crossentropy": 2.0211291909217834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146567404270172, "step": 12034 }, { "epoch": 0.24072, "grad_norm": 2.0625, "grad_norm_var": 0.012312825520833333, "learning_rate": 0.0001, "loss": 4.3156, "loss/crossentropy": 2.165773868560791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21522508561611176, "step": 12036 }, { "epoch": 0.24076, "grad_norm": 1.9609375, "grad_norm_var": 0.013216145833333333, "learning_rate": 0.0001, "loss": 4.2253, "loss/crossentropy": 2.0272024273872375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20012138038873672, "step": 12038 }, { "epoch": 0.2408, "grad_norm": 1.9453125, "grad_norm_var": 0.011946360270182291, "learning_rate": 0.0001, "loss": 4.2419, "loss/crossentropy": 1.9311429262161255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19755128771066666, "step": 12040 }, { "epoch": 0.24084, "grad_norm": 2.0625, "grad_norm_var": 0.012086741129557292, "learning_rate": 0.0001, "loss": 4.1857, "loss/crossentropy": 2.0740894079208374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20651141554117203, "step": 12042 }, { "epoch": 0.24088, "grad_norm": 1.96875, "grad_norm_var": 0.012876129150390625, "learning_rate": 0.0001, "loss": 3.9882, "loss/crossentropy": 2.215203881263733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21735452860593796, "step": 12044 }, { "epoch": 0.24092, "grad_norm": 2.015625, "grad_norm_var": 0.02371190388997396, "learning_rate": 0.0001, "loss": 4.3755, "loss/crossentropy": 1.9994693994522095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001128643751144, "step": 12046 }, { "epoch": 0.24096, "grad_norm": 2.15625, "grad_norm_var": 0.025187174479166668, "learning_rate": 0.0001, "loss": 4.1145, "loss/crossentropy": 2.195701003074646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20777013897895813, "step": 12048 }, { "epoch": 0.241, "grad_norm": 2.109375, "grad_norm_var": 0.02474950154622396, "learning_rate": 0.0001, "loss": 4.3486, "loss/crossentropy": 2.1975715160369873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21987678855657578, "step": 12050 }, { "epoch": 0.24104, "grad_norm": 1.953125, "grad_norm_var": 0.02188695271809896, "learning_rate": 0.0001, "loss": 4.1337, "loss/crossentropy": 2.225023865699768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24454358220100403, "step": 12052 }, { "epoch": 0.24108, "grad_norm": 1.9296875, "grad_norm_var": 0.022200520833333334, "learning_rate": 0.0001, "loss": 4.0404, "loss/crossentropy": 1.892149806022644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110423520207405, "step": 12054 }, { "epoch": 0.24112, "grad_norm": 1.8984375, "grad_norm_var": 0.023579915364583332, "learning_rate": 0.0001, "loss": 4.1604, "loss/crossentropy": 1.9996158480644226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821269765496254, "step": 12056 }, { "epoch": 0.24116, "grad_norm": 2.109375, "grad_norm_var": 0.02569580078125, "learning_rate": 0.0001, "loss": 4.0591, "loss/crossentropy": 2.0965115427970886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140883505344391, "step": 12058 }, { "epoch": 0.2412, "grad_norm": 1.9296875, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 3.9494, "loss/crossentropy": 1.9169449210166931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034926936030388, "step": 12060 }, { "epoch": 0.24124, "grad_norm": 2.0625, "grad_norm_var": 0.015135701497395833, "learning_rate": 0.0001, "loss": 4.5, "loss/crossentropy": 2.3266680240631104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24314038455486298, "step": 12062 }, { "epoch": 0.24128, "grad_norm": 1.9921875, "grad_norm_var": 0.013509114583333334, "learning_rate": 0.0001, "loss": 4.1009, "loss/crossentropy": 2.0337759256362915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20842785388231277, "step": 12064 }, { "epoch": 0.24132, "grad_norm": 1.828125, "grad_norm_var": 0.013618977864583333, "learning_rate": 0.0001, "loss": 3.9186, "loss/crossentropy": 1.9502894878387451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1812899112701416, "step": 12066 }, { "epoch": 0.24136, "grad_norm": 1.9765625, "grad_norm_var": 0.013822174072265625, "learning_rate": 0.0001, "loss": 4.269, "loss/crossentropy": 2.13326895236969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21872679144144058, "step": 12068 }, { "epoch": 0.2414, "grad_norm": 2.125, "grad_norm_var": 0.013304646809895833, "learning_rate": 0.0001, "loss": 4.3594, "loss/crossentropy": 2.0836809873580933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212929405272007, "step": 12070 }, { "epoch": 0.24144, "grad_norm": 2.03125, "grad_norm_var": 0.012516276041666666, "learning_rate": 0.0001, "loss": 4.0467, "loss/crossentropy": 2.207913398742676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20640508085489273, "step": 12072 }, { "epoch": 0.24148, "grad_norm": 2.03125, "grad_norm_var": 0.01080322265625, "learning_rate": 0.0001, "loss": 4.1877, "loss/crossentropy": 2.0770451426506042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20633003860712051, "step": 12074 }, { "epoch": 0.24152, "grad_norm": 2.0625, "grad_norm_var": 0.010786946614583333, "learning_rate": 0.0001, "loss": 4.2237, "loss/crossentropy": 2.0042858719825745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18523608148097992, "step": 12076 }, { "epoch": 0.24156, "grad_norm": 1.9453125, "grad_norm_var": 0.010141754150390625, "learning_rate": 0.0001, "loss": 4.2755, "loss/crossentropy": 1.8673237562179565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19267796725034714, "step": 12078 }, { "epoch": 0.2416, "grad_norm": 2.046875, "grad_norm_var": 0.013793690999348959, "learning_rate": 0.0001, "loss": 3.9766, "loss/crossentropy": 2.0686238408088684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19344990700483322, "step": 12080 }, { "epoch": 0.24164, "grad_norm": 2.03125, "grad_norm_var": 0.009504954020182291, "learning_rate": 0.0001, "loss": 4.0366, "loss/crossentropy": 2.0069685578346252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1971682757139206, "step": 12082 }, { "epoch": 0.24168, "grad_norm": 2.0625, "grad_norm_var": 0.009468587239583333, "learning_rate": 0.0001, "loss": 4.1975, "loss/crossentropy": 2.2052754163742065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20646511763334274, "step": 12084 }, { "epoch": 0.24172, "grad_norm": 2.078125, "grad_norm_var": 0.008983357747395834, "learning_rate": 0.0001, "loss": 4.1362, "loss/crossentropy": 2.0343621373176575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19575881958007812, "step": 12086 }, { "epoch": 0.24176, "grad_norm": 1.890625, "grad_norm_var": 0.009894816080729167, "learning_rate": 0.0001, "loss": 3.9695, "loss/crossentropy": 2.2500641345977783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23151995986700058, "step": 12088 }, { "epoch": 0.2418, "grad_norm": 2.421875, "grad_norm_var": 0.0375640869140625, "learning_rate": 0.0001, "loss": 4.4361, "loss/crossentropy": 2.2596821784973145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22305716574192047, "step": 12090 }, { "epoch": 0.24184, "grad_norm": 1.9375, "grad_norm_var": 0.03715184529622396, "learning_rate": 0.0001, "loss": 4.0381, "loss/crossentropy": 2.0940088033676147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23409207165241241, "step": 12092 }, { "epoch": 0.24188, "grad_norm": 2.046875, "grad_norm_var": 0.03586018880208333, "learning_rate": 0.0001, "loss": 3.9715, "loss/crossentropy": 1.6980834603309631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19339510053396225, "step": 12094 }, { "epoch": 0.24192, "grad_norm": 2.09375, "grad_norm_var": 0.03047459920247396, "learning_rate": 0.0001, "loss": 4.192, "loss/crossentropy": 2.1921679973602295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21085387468338013, "step": 12096 }, { "epoch": 0.24196, "grad_norm": 2.34375, "grad_norm_var": 0.03551839192708333, "learning_rate": 0.0001, "loss": 4.3551, "loss/crossentropy": 2.18610817193985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2332247570157051, "step": 12098 }, { "epoch": 0.242, "grad_norm": 2.203125, "grad_norm_var": 0.038852691650390625, "learning_rate": 0.0001, "loss": 4.1216, "loss/crossentropy": 2.08358097076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21169763058423996, "step": 12100 }, { "epoch": 0.24204, "grad_norm": 2.140625, "grad_norm_var": 0.038913726806640625, "learning_rate": 0.0001, "loss": 4.2339, "loss/crossentropy": 2.1001542806625366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132650762796402, "step": 12102 }, { "epoch": 0.24208, "grad_norm": 2.109375, "grad_norm_var": 0.033614095052083334, "learning_rate": 0.0001, "loss": 4.2051, "loss/crossentropy": 1.9031851887702942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042345404624939, "step": 12104 }, { "epoch": 0.24212, "grad_norm": 1.96875, "grad_norm_var": 0.0127838134765625, "learning_rate": 0.0001, "loss": 4.0341, "loss/crossentropy": 1.670085072517395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878880336880684, "step": 12106 }, { "epoch": 0.24216, "grad_norm": 2.09375, "grad_norm_var": 0.012914784749348958, "learning_rate": 0.0001, "loss": 4.3395, "loss/crossentropy": 1.9557109475135803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211412936449051, "step": 12108 }, { "epoch": 0.2422, "grad_norm": 1.96875, "grad_norm_var": 0.013637034098307292, "learning_rate": 0.0001, "loss": 4.0107, "loss/crossentropy": 2.085852086544037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21336784213781357, "step": 12110 }, { "epoch": 0.24224, "grad_norm": 1.8671875, "grad_norm_var": 0.015721638997395832, "learning_rate": 0.0001, "loss": 4.0321, "loss/crossentropy": 2.1064602732658386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096766158938408, "step": 12112 }, { "epoch": 0.24228, "grad_norm": 2.109375, "grad_norm_var": 0.009325917561848958, "learning_rate": 0.0001, "loss": 4.2998, "loss/crossentropy": 2.381577968597412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23788201808929443, "step": 12114 }, { "epoch": 0.24232, "grad_norm": 1.96875, "grad_norm_var": 0.006758626302083333, "learning_rate": 0.0001, "loss": 4.0687, "loss/crossentropy": 1.7200234532356262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18620850890874863, "step": 12116 }, { "epoch": 0.24236, "grad_norm": 2.1875, "grad_norm_var": 0.007697550455729166, "learning_rate": 0.0001, "loss": 4.2672, "loss/crossentropy": 1.9106165170669556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21456392109394073, "step": 12118 }, { "epoch": 0.2424, "grad_norm": 1.9609375, "grad_norm_var": 0.007106272379557291, "learning_rate": 0.0001, "loss": 4.04, "loss/crossentropy": 1.9852410554885864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1927378550171852, "step": 12120 }, { "epoch": 0.24244, "grad_norm": 1.96875, "grad_norm_var": 0.007991282145182292, "learning_rate": 0.0001, "loss": 4.1278, "loss/crossentropy": 1.4948370456695557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1698828637599945, "step": 12122 }, { "epoch": 0.24248, "grad_norm": 1.9375, "grad_norm_var": 0.0078857421875, "learning_rate": 0.0001, "loss": 4.2874, "loss/crossentropy": 1.956885814666748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19851476699113846, "step": 12124 }, { "epoch": 0.24252, "grad_norm": 2.171875, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 3.8709, "loss/crossentropy": 1.9052257537841797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18438522517681122, "step": 12126 }, { "epoch": 0.24256, "grad_norm": 2.1875, "grad_norm_var": 0.008430735270182291, "learning_rate": 0.0001, "loss": 4.4347, "loss/crossentropy": 2.353670358657837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22648276388645172, "step": 12128 }, { "epoch": 0.2426, "grad_norm": 1.8203125, "grad_norm_var": 0.01263427734375, "learning_rate": 0.0001, "loss": 3.9896, "loss/crossentropy": 1.714030683040619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19275055080652237, "step": 12130 }, { "epoch": 0.24264, "grad_norm": 2.046875, "grad_norm_var": 0.012898763020833334, "learning_rate": 0.0001, "loss": 4.2981, "loss/crossentropy": 2.0048200488090515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20478381216526031, "step": 12132 }, { "epoch": 0.24268, "grad_norm": 2.125, "grad_norm_var": 0.011766560872395833, "learning_rate": 0.0001, "loss": 4.0497, "loss/crossentropy": 1.7151115536689758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19872941821813583, "step": 12134 }, { "epoch": 0.24272, "grad_norm": 2.109375, "grad_norm_var": 0.011818186442057291, "learning_rate": 0.0001, "loss": 4.2753, "loss/crossentropy": 2.1646838188171387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22345983982086182, "step": 12136 }, { "epoch": 0.24276, "grad_norm": 2.046875, "grad_norm_var": 0.010807037353515625, "learning_rate": 0.0001, "loss": 4.157, "loss/crossentropy": 2.053311765193939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2143322378396988, "step": 12138 }, { "epoch": 0.2428, "grad_norm": 2.015625, "grad_norm_var": 0.009993235270182291, "learning_rate": 0.0001, "loss": 3.8805, "loss/crossentropy": 2.0471617579460144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20450014621019363, "step": 12140 }, { "epoch": 0.24284, "grad_norm": 2.1875, "grad_norm_var": 0.008737945556640625, "learning_rate": 0.0001, "loss": 4.2966, "loss/crossentropy": 2.1727033853530884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22905820608139038, "step": 12142 }, { "epoch": 0.24288, "grad_norm": 2.09375, "grad_norm_var": 0.008245595296223958, "learning_rate": 0.0001, "loss": 4.3861, "loss/crossentropy": 2.197450280189514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22059273719787598, "step": 12144 }, { "epoch": 0.24292, "grad_norm": 2.171875, "grad_norm_var": 0.003413899739583333, "learning_rate": 0.0001, "loss": 4.4841, "loss/crossentropy": 1.9954137206077576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22441548854112625, "step": 12146 }, { "epoch": 0.24296, "grad_norm": 2.0, "grad_norm_var": 0.005110677083333333, "learning_rate": 0.0001, "loss": 4.2113, "loss/crossentropy": 2.3393132090568542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22949577867984772, "step": 12148 }, { "epoch": 0.243, "grad_norm": 1.953125, "grad_norm_var": 0.0067291259765625, "learning_rate": 0.0001, "loss": 4.146, "loss/crossentropy": 2.11602646112442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19181709736585617, "step": 12150 }, { "epoch": 0.24304, "grad_norm": 2.078125, "grad_norm_var": 0.005597941080729167, "learning_rate": 0.0001, "loss": 4.1381, "loss/crossentropy": 2.130657136440277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20904043316841125, "step": 12152 }, { "epoch": 0.24308, "grad_norm": 2.078125, "grad_norm_var": 0.0053293863932291664, "learning_rate": 0.0001, "loss": 4.2983, "loss/crossentropy": 2.3923556804656982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23029828071594238, "step": 12154 }, { "epoch": 0.24312, "grad_norm": 1.875, "grad_norm_var": 0.007209269205729166, "learning_rate": 0.0001, "loss": 3.9697, "loss/crossentropy": 2.024892747402191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879258424043655, "step": 12156 }, { "epoch": 0.24316, "grad_norm": 1.921875, "grad_norm_var": 0.007826487223307291, "learning_rate": 0.0001, "loss": 3.8761, "loss/crossentropy": 1.7833393812179565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18542324006557465, "step": 12158 }, { "epoch": 0.2432, "grad_norm": 1.890625, "grad_norm_var": 0.007328033447265625, "learning_rate": 0.0001, "loss": 4.1896, "loss/crossentropy": 2.253599762916565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148476019501686, "step": 12160 }, { "epoch": 0.24324, "grad_norm": 2.046875, "grad_norm_var": 0.005177561442057292, "learning_rate": 0.0001, "loss": 4.0253, "loss/crossentropy": 1.7218471765518188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1923334002494812, "step": 12162 }, { "epoch": 0.24328, "grad_norm": 1.9609375, "grad_norm_var": 0.007575480143229166, "learning_rate": 0.0001, "loss": 4.2947, "loss/crossentropy": 2.210463523864746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2203536182641983, "step": 12164 }, { "epoch": 0.24332, "grad_norm": 2.0625, "grad_norm_var": 0.0076487223307291664, "learning_rate": 0.0001, "loss": 4.4164, "loss/crossentropy": 2.452837347984314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22888235747814178, "step": 12166 }, { "epoch": 0.24336, "grad_norm": 1.984375, "grad_norm_var": 0.007657877604166667, "learning_rate": 0.0001, "loss": 4.2157, "loss/crossentropy": 2.3511279821395874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2431751787662506, "step": 12168 }, { "epoch": 0.2434, "grad_norm": 2.03125, "grad_norm_var": 0.007819620768229167, "learning_rate": 0.0001, "loss": 4.0911, "loss/crossentropy": 2.0734334588050842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21137738972902298, "step": 12170 }, { "epoch": 0.24344, "grad_norm": 2.03125, "grad_norm_var": 0.0065582275390625, "learning_rate": 0.0001, "loss": 4.0987, "loss/crossentropy": 1.9492397904396057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22894760966300964, "step": 12172 }, { "epoch": 0.24348, "grad_norm": 2.046875, "grad_norm_var": 0.005936431884765625, "learning_rate": 0.0001, "loss": 4.2316, "loss/crossentropy": 2.03458708524704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20981666445732117, "step": 12174 }, { "epoch": 0.24352, "grad_norm": 1.96875, "grad_norm_var": 0.007059478759765625, "learning_rate": 0.0001, "loss": 4.3396, "loss/crossentropy": 2.1464394330978394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20741496980190277, "step": 12176 }, { "epoch": 0.24356, "grad_norm": 2.015625, "grad_norm_var": 0.0064999898274739586, "learning_rate": 0.0001, "loss": 3.9902, "loss/crossentropy": 1.8449034094810486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21950078010559082, "step": 12178 }, { "epoch": 0.2436, "grad_norm": 2.15625, "grad_norm_var": 0.0054433186848958336, "learning_rate": 0.0001, "loss": 4.0129, "loss/crossentropy": 1.8646993041038513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117064744234085, "step": 12180 }, { "epoch": 0.24364, "grad_norm": 1.859375, "grad_norm_var": 0.008003743489583333, "learning_rate": 0.0001, "loss": 3.9347, "loss/crossentropy": 1.9643146991729736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1946149319410324, "step": 12182 }, { "epoch": 0.24368, "grad_norm": 2.109375, "grad_norm_var": 0.008141835530598959, "learning_rate": 0.0001, "loss": 4.1468, "loss/crossentropy": 1.867002248764038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19111115485429764, "step": 12184 }, { "epoch": 0.24372, "grad_norm": 2.0625, "grad_norm_var": 0.008129628499348958, "learning_rate": 0.0001, "loss": 4.0443, "loss/crossentropy": 1.9431232810020447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19612180441617966, "step": 12186 }, { "epoch": 0.24376, "grad_norm": 2.25, "grad_norm_var": 0.0105224609375, "learning_rate": 0.0001, "loss": 4.2931, "loss/crossentropy": 2.197216033935547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22340577840805054, "step": 12188 }, { "epoch": 0.2438, "grad_norm": 2.109375, "grad_norm_var": 0.010469563802083333, "learning_rate": 0.0001, "loss": 4.2499, "loss/crossentropy": 1.916576623916626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20536810904741287, "step": 12190 }, { "epoch": 0.24384, "grad_norm": 2.25, "grad_norm_var": 0.010497029622395833, "learning_rate": 0.0001, "loss": 4.3838, "loss/crossentropy": 2.0369369983673096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26386965811252594, "step": 12192 }, { "epoch": 0.24388, "grad_norm": 2.09375, "grad_norm_var": 0.01639404296875, "learning_rate": 0.0001, "loss": 4.2817, "loss/crossentropy": 2.086443066596985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21351207792758942, "step": 12194 }, { "epoch": 0.24392, "grad_norm": 2.015625, "grad_norm_var": 0.01619873046875, "learning_rate": 0.0001, "loss": 4.3649, "loss/crossentropy": 2.0969839096069336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21489758789539337, "step": 12196 }, { "epoch": 0.24396, "grad_norm": 1.859375, "grad_norm_var": 0.016463216145833334, "learning_rate": 0.0001, "loss": 4.2481, "loss/crossentropy": 2.086832642555237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20651830732822418, "step": 12198 }, { "epoch": 0.244, "grad_norm": 2.015625, "grad_norm_var": 0.017146809895833334, "learning_rate": 0.0001, "loss": 4.1397, "loss/crossentropy": 1.9104391932487488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963956654071808, "step": 12200 }, { "epoch": 0.24404, "grad_norm": 2.03125, "grad_norm_var": 0.0191314697265625, "learning_rate": 0.0001, "loss": 3.8558, "loss/crossentropy": 1.9419523477554321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19279541075229645, "step": 12202 }, { "epoch": 0.24408, "grad_norm": 1.9921875, "grad_norm_var": 0.01697998046875, "learning_rate": 0.0001, "loss": 4.3239, "loss/crossentropy": 2.2867754697799683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23261378705501556, "step": 12204 }, { "epoch": 0.24412, "grad_norm": 1.90625, "grad_norm_var": 0.017634073893229168, "learning_rate": 0.0001, "loss": 4.0786, "loss/crossentropy": 1.941315233707428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20962880551815033, "step": 12206 }, { "epoch": 0.24416, "grad_norm": 2.140625, "grad_norm_var": 0.01541748046875, "learning_rate": 0.0001, "loss": 4.2044, "loss/crossentropy": 2.0638798475265503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21499283611774445, "step": 12208 }, { "epoch": 0.2442, "grad_norm": 2.140625, "grad_norm_var": 0.008454386393229167, "learning_rate": 0.0001, "loss": 4.1889, "loss/crossentropy": 2.3167499899864197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21805942803621292, "step": 12210 }, { "epoch": 0.24424, "grad_norm": 2.0, "grad_norm_var": 0.00848388671875, "learning_rate": 0.0001, "loss": 4.0956, "loss/crossentropy": 2.1785646080970764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21259736269712448, "step": 12212 }, { "epoch": 0.24428, "grad_norm": 2.03125, "grad_norm_var": 0.006615193684895834, "learning_rate": 0.0001, "loss": 4.2054, "loss/crossentropy": 1.983469545841217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21091710776090622, "step": 12214 }, { "epoch": 0.24432, "grad_norm": 2.078125, "grad_norm_var": 0.006937408447265625, "learning_rate": 0.0001, "loss": 4.2283, "loss/crossentropy": 1.8501896858215332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20537292212247849, "step": 12216 }, { "epoch": 0.24436, "grad_norm": 2.1875, "grad_norm_var": 0.0072934468587239586, "learning_rate": 0.0001, "loss": 4.182, "loss/crossentropy": 1.7293490767478943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916719302535057, "step": 12218 }, { "epoch": 0.2444, "grad_norm": 2.03125, "grad_norm_var": 0.0069488525390625, "learning_rate": 0.0001, "loss": 4.3247, "loss/crossentropy": 2.0931429862976074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22081860899925232, "step": 12220 }, { "epoch": 0.24444, "grad_norm": 2.03125, "grad_norm_var": 0.007657623291015625, "learning_rate": 0.0001, "loss": 4.0619, "loss/crossentropy": 2.157875657081604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20259525626897812, "step": 12222 }, { "epoch": 0.24448, "grad_norm": 2.0625, "grad_norm_var": 0.007248687744140625, "learning_rate": 0.0001, "loss": 4.2712, "loss/crossentropy": 2.2028552889823914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2431233748793602, "step": 12224 }, { "epoch": 0.24452, "grad_norm": 2.046875, "grad_norm_var": 0.006473541259765625, "learning_rate": 0.0001, "loss": 4.1184, "loss/crossentropy": 2.0906929969787598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051575481891632, "step": 12226 }, { "epoch": 0.24456, "grad_norm": 2.078125, "grad_norm_var": 0.006091054280598958, "learning_rate": 0.0001, "loss": 4.5601, "loss/crossentropy": 2.5020272731781006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2485819309949875, "step": 12228 }, { "epoch": 0.2446, "grad_norm": 8.0625, "grad_norm_var": 2.244887034098307, "learning_rate": 0.0001, "loss": 4.1465, "loss/crossentropy": 1.502736508846283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839340552687645, "step": 12230 }, { "epoch": 0.24464, "grad_norm": 2.484375, "grad_norm_var": 2.2357358296712238, "learning_rate": 0.0001, "loss": 4.3882, "loss/crossentropy": 2.1888676285743713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.226266011595726, "step": 12232 }, { "epoch": 0.24468, "grad_norm": 2.125, "grad_norm_var": 2.243033599853516, "learning_rate": 0.0001, "loss": 4.3887, "loss/crossentropy": 2.377061367034912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23195213079452515, "step": 12234 }, { "epoch": 0.24472, "grad_norm": 2.03125, "grad_norm_var": 2.257559967041016, "learning_rate": 0.0001, "loss": 3.9354, "loss/crossentropy": 2.0567076206207275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20983586460351944, "step": 12236 }, { "epoch": 0.24476, "grad_norm": 2.015625, "grad_norm_var": 2.246906534830729, "learning_rate": 0.0001, "loss": 4.1978, "loss/crossentropy": 2.007763922214508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21983467787504196, "step": 12238 }, { "epoch": 0.2448, "grad_norm": 2.078125, "grad_norm_var": 2.247749837239583, "learning_rate": 0.0001, "loss": 4.1411, "loss/crossentropy": 2.3309481143951416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23099908232688904, "step": 12240 }, { "epoch": 0.24484, "grad_norm": 2.5, "grad_norm_var": 2.23668212890625, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 1.8231184482574463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22009263187646866, "step": 12242 }, { "epoch": 0.24488, "grad_norm": 1.9609375, "grad_norm_var": 2.2464637756347656, "learning_rate": 0.0001, "loss": 4.1131, "loss/crossentropy": 2.140045642852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22202204167842865, "step": 12244 }, { "epoch": 0.24492, "grad_norm": 1.984375, "grad_norm_var": 0.02835057576497396, "learning_rate": 0.0001, "loss": 4.2209, "loss/crossentropy": 1.8495931029319763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19803623855113983, "step": 12246 }, { "epoch": 0.24496, "grad_norm": 2.171875, "grad_norm_var": 0.0185943603515625, "learning_rate": 0.0001, "loss": 4.1624, "loss/crossentropy": 2.2141982913017273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23418182879686356, "step": 12248 }, { "epoch": 0.245, "grad_norm": 2.03125, "grad_norm_var": 0.01844482421875, "learning_rate": 0.0001, "loss": 4.4104, "loss/crossentropy": 2.14465594291687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22683896869421005, "step": 12250 }, { "epoch": 0.24504, "grad_norm": 2.0625, "grad_norm_var": 0.01610107421875, "learning_rate": 0.0001, "loss": 4.1913, "loss/crossentropy": 2.0804443359375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265845239162445, "step": 12252 }, { "epoch": 0.24508, "grad_norm": 2.046875, "grad_norm_var": 0.0160308837890625, "learning_rate": 0.0001, "loss": 4.485, "loss/crossentropy": 2.2451776266098022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22881492972373962, "step": 12254 }, { "epoch": 0.24512, "grad_norm": 1.953125, "grad_norm_var": 0.016947428385416668, "learning_rate": 0.0001, "loss": 4.0869, "loss/crossentropy": 1.6861794590950012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18126793205738068, "step": 12256 }, { "epoch": 0.24516, "grad_norm": 1.96875, "grad_norm_var": 0.0059773763020833336, "learning_rate": 0.0001, "loss": 4.2343, "loss/crossentropy": 1.87660551071167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20990663766860962, "step": 12258 }, { "epoch": 0.2452, "grad_norm": 2.15625, "grad_norm_var": 0.006799062093098958, "learning_rate": 0.0001, "loss": 4.2019, "loss/crossentropy": 2.103124976158142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19615671038627625, "step": 12260 }, { "epoch": 0.24524, "grad_norm": 2.0, "grad_norm_var": 0.0069048563639322914, "learning_rate": 0.0001, "loss": 4.2509, "loss/crossentropy": 2.139270842075348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21861301362514496, "step": 12262 }, { "epoch": 0.24528, "grad_norm": 2.140625, "grad_norm_var": 0.00791015625, "learning_rate": 0.0001, "loss": 4.4081, "loss/crossentropy": 2.108114778995514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20996354520320892, "step": 12264 }, { "epoch": 0.24532, "grad_norm": 1.9453125, "grad_norm_var": 0.008847808837890625, "learning_rate": 0.0001, "loss": 3.9495, "loss/crossentropy": 2.1205111145973206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21352755278348923, "step": 12266 }, { "epoch": 0.24536, "grad_norm": 2.109375, "grad_norm_var": 0.009248860677083333, "learning_rate": 0.0001, "loss": 3.9946, "loss/crossentropy": 2.20136821269989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2277800738811493, "step": 12268 }, { "epoch": 0.2454, "grad_norm": 2.046875, "grad_norm_var": 0.011336008707682291, "learning_rate": 0.0001, "loss": 4.1116, "loss/crossentropy": 2.1149147748947144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062554806470871, "step": 12270 }, { "epoch": 0.24544, "grad_norm": 2.4375, "grad_norm_var": 0.01932347615559896, "learning_rate": 0.0001, "loss": 4.5226, "loss/crossentropy": 2.1119648218154907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2922344133257866, "step": 12272 }, { "epoch": 0.24548, "grad_norm": 2.0625, "grad_norm_var": 0.017964680989583332, "learning_rate": 0.0001, "loss": 4.1766, "loss/crossentropy": 2.2631434202194214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044210433959961, "step": 12274 }, { "epoch": 0.24552, "grad_norm": 2.125, "grad_norm_var": 0.06928609212239584, "learning_rate": 0.0001, "loss": 4.0956, "loss/crossentropy": 2.264181971549988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23541904985904694, "step": 12276 }, { "epoch": 0.24556, "grad_norm": 2.0625, "grad_norm_var": 0.06836649576822916, "learning_rate": 0.0001, "loss": 4.2083, "loss/crossentropy": 2.256329298019409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21784210950136185, "step": 12278 }, { "epoch": 0.2456, "grad_norm": 2.21875, "grad_norm_var": 0.06968765258789063, "learning_rate": 0.0001, "loss": 4.267, "loss/crossentropy": 2.0962833166122437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.211236834526062, "step": 12280 }, { "epoch": 0.24564, "grad_norm": 2.140625, "grad_norm_var": 0.0683990478515625, "learning_rate": 0.0001, "loss": 4.4767, "loss/crossentropy": 2.3430999517440796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25170228630304337, "step": 12282 }, { "epoch": 0.24568, "grad_norm": 2.125, "grad_norm_var": 0.06921361287434896, "learning_rate": 0.0001, "loss": 4.1492, "loss/crossentropy": 1.8256065845489502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20632392168045044, "step": 12284 }, { "epoch": 0.24572, "grad_norm": 1.96875, "grad_norm_var": 0.06643778483072917, "learning_rate": 0.0001, "loss": 4.294, "loss/crossentropy": 2.481971561908722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121504619717598, "step": 12286 }, { "epoch": 0.24576, "grad_norm": 2.125, "grad_norm_var": 0.060469563802083334, "learning_rate": 0.0001, "loss": 4.2136, "loss/crossentropy": 2.2815581560134888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22004567831754684, "step": 12288 }, { "epoch": 0.2458, "grad_norm": 1.9921875, "grad_norm_var": 0.060323079427083336, "learning_rate": 0.0001, "loss": 4.2426, "loss/crossentropy": 1.7125394940376282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19618325680494308, "step": 12290 }, { "epoch": 0.24584, "grad_norm": 1.859375, "grad_norm_var": 0.008347320556640624, "learning_rate": 0.0001, "loss": 3.9676, "loss/crossentropy": 2.0333253145217896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20251524448394775, "step": 12292 }, { "epoch": 0.24588, "grad_norm": 1.984375, "grad_norm_var": 0.008939361572265625, "learning_rate": 0.0001, "loss": 4.0958, "loss/crossentropy": 2.359953284263611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23697812855243683, "step": 12294 }, { "epoch": 0.24592, "grad_norm": 2.0625, "grad_norm_var": 0.0065348307291666664, "learning_rate": 0.0001, "loss": 3.9914, "loss/crossentropy": 1.8465049266815186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18830078095197678, "step": 12296 }, { "epoch": 0.24596, "grad_norm": 2.109375, "grad_norm_var": 0.00662841796875, "learning_rate": 0.0001, "loss": 4.1669, "loss/crossentropy": 2.407967984676361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20054074376821518, "step": 12298 }, { "epoch": 0.246, "grad_norm": 2.015625, "grad_norm_var": 0.007005818684895833, "learning_rate": 0.0001, "loss": 4.1943, "loss/crossentropy": 2.306265115737915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2286146879196167, "step": 12300 }, { "epoch": 0.24604, "grad_norm": 1.90625, "grad_norm_var": 0.0076416015625, "learning_rate": 0.0001, "loss": 4.108, "loss/crossentropy": 1.9776363968849182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015514224767685, "step": 12302 }, { "epoch": 0.24608, "grad_norm": 1.859375, "grad_norm_var": 0.010041300455729167, "learning_rate": 0.0001, "loss": 3.7023, "loss/crossentropy": 1.6589386463165283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16657201945781708, "step": 12304 }, { "epoch": 0.24612, "grad_norm": 1.9765625, "grad_norm_var": 0.009968058268229166, "learning_rate": 0.0001, "loss": 4.2749, "loss/crossentropy": 2.263510227203369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21220041066408157, "step": 12306 }, { "epoch": 0.24616, "grad_norm": 2.015625, "grad_norm_var": 0.008583323160807291, "learning_rate": 0.0001, "loss": 4.1735, "loss/crossentropy": 2.0802704095840454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20156628638505936, "step": 12308 }, { "epoch": 0.2462, "grad_norm": 2.0, "grad_norm_var": 0.015193430582682292, "learning_rate": 0.0001, "loss": 4.2597, "loss/crossentropy": 2.0921266674995422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22044362872838974, "step": 12310 }, { "epoch": 0.24624, "grad_norm": 1.8671875, "grad_norm_var": 0.0164703369140625, "learning_rate": 0.0001, "loss": 3.8587, "loss/crossentropy": 1.7366862297058105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1722203940153122, "step": 12312 }, { "epoch": 0.24628, "grad_norm": 2.046875, "grad_norm_var": 0.016377766927083332, "learning_rate": 0.0001, "loss": 4.1408, "loss/crossentropy": 2.0741729140281677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1942591667175293, "step": 12314 }, { "epoch": 0.24632, "grad_norm": 2.0625, "grad_norm_var": 0.0145751953125, "learning_rate": 0.0001, "loss": 4.1343, "loss/crossentropy": 2.0611414909362793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22244945168495178, "step": 12316 }, { "epoch": 0.24636, "grad_norm": 2.015625, "grad_norm_var": 0.014196523030598958, "learning_rate": 0.0001, "loss": 3.8586, "loss/crossentropy": 1.9337337017059326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20655318349599838, "step": 12318 }, { "epoch": 0.2464, "grad_norm": 2.09375, "grad_norm_var": 0.012141672770182292, "learning_rate": 0.0001, "loss": 3.9197, "loss/crossentropy": 1.8766502737998962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851816102862358, "step": 12320 }, { "epoch": 0.24644, "grad_norm": 2.0625, "grad_norm_var": 0.012552897135416666, "learning_rate": 0.0001, "loss": 4.2407, "loss/crossentropy": 2.121203899383545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2177201583981514, "step": 12322 }, { "epoch": 0.24648, "grad_norm": 2.03125, "grad_norm_var": 0.0136138916015625, "learning_rate": 0.0001, "loss": 4.1441, "loss/crossentropy": 1.8102782368659973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19694476574659348, "step": 12324 }, { "epoch": 0.24652, "grad_norm": 1.984375, "grad_norm_var": 0.0062164306640625, "learning_rate": 0.0001, "loss": 4.2526, "loss/crossentropy": 2.323423147201538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20612536370754242, "step": 12326 }, { "epoch": 0.24656, "grad_norm": 2.15625, "grad_norm_var": 0.007814280192057292, "learning_rate": 0.0001, "loss": 4.4862, "loss/crossentropy": 2.2835768461227417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22697383165359497, "step": 12328 }, { "epoch": 0.2466, "grad_norm": 2.09375, "grad_norm_var": 0.006192779541015625, "learning_rate": 0.0001, "loss": 4.2868, "loss/crossentropy": 2.197639048099518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22446971386671066, "step": 12330 }, { "epoch": 0.24664, "grad_norm": 1.9296875, "grad_norm_var": 0.006761678059895833, "learning_rate": 0.0001, "loss": 4.0758, "loss/crossentropy": 2.1731618642807007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21908622235059738, "step": 12332 }, { "epoch": 0.24668, "grad_norm": 1.875, "grad_norm_var": 0.009291330973307291, "learning_rate": 0.0001, "loss": 3.9108, "loss/crossentropy": 1.9236284494400024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1810280755162239, "step": 12334 }, { "epoch": 0.24672, "grad_norm": 2.0, "grad_norm_var": 0.008318837483723958, "learning_rate": 0.0001, "loss": 4.3975, "loss/crossentropy": 2.2902809381484985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23192713409662247, "step": 12336 }, { "epoch": 0.24676, "grad_norm": 2.015625, "grad_norm_var": 0.008367665608723958, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 1.9808775186538696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19344759732484818, "step": 12338 }, { "epoch": 0.2468, "grad_norm": 1.921875, "grad_norm_var": 0.008337148030598958, "learning_rate": 0.0001, "loss": 4.1581, "loss/crossentropy": 2.148995041847229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19994106143712997, "step": 12340 }, { "epoch": 0.24684, "grad_norm": 2.109375, "grad_norm_var": 0.008528391520182291, "learning_rate": 0.0001, "loss": 4.3927, "loss/crossentropy": 2.374568462371826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24350540339946747, "step": 12342 }, { "epoch": 0.24688, "grad_norm": 2.1875, "grad_norm_var": 0.006959788004557292, "learning_rate": 0.0001, "loss": 4.258, "loss/crossentropy": 1.9702014923095703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038111537694931, "step": 12344 }, { "epoch": 0.24692, "grad_norm": 1.921875, "grad_norm_var": 0.007008616129557292, "learning_rate": 0.0001, "loss": 4.0126, "loss/crossentropy": 1.8438855409622192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822836771607399, "step": 12346 }, { "epoch": 0.24696, "grad_norm": 2.265625, "grad_norm_var": 0.010754140218098958, "learning_rate": 0.0001, "loss": 4.1352, "loss/crossentropy": 2.005755662918091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21399007737636566, "step": 12348 }, { "epoch": 0.247, "grad_norm": 2.015625, "grad_norm_var": 0.008819325764973959, "learning_rate": 0.0001, "loss": 4.0988, "loss/crossentropy": 1.8640305399894714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19055305421352386, "step": 12350 }, { "epoch": 0.24704, "grad_norm": 2.140625, "grad_norm_var": 0.010453033447265624, "learning_rate": 0.0001, "loss": 4.1981, "loss/crossentropy": 2.1293725967407227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117801234126091, "step": 12352 }, { "epoch": 0.24708, "grad_norm": 2.484375, "grad_norm_var": 0.02337621053059896, "learning_rate": 0.0001, "loss": 4.2666, "loss/crossentropy": 1.7655459642410278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085946872830391, "step": 12354 }, { "epoch": 0.24712, "grad_norm": 2.046875, "grad_norm_var": 0.021897125244140624, "learning_rate": 0.0001, "loss": 4.3175, "loss/crossentropy": 2.184986114501953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22597889602184296, "step": 12356 }, { "epoch": 0.24716, "grad_norm": 2.015625, "grad_norm_var": 0.021897125244140624, "learning_rate": 0.0001, "loss": 4.1295, "loss/crossentropy": 2.368021607398987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21957845985889435, "step": 12358 }, { "epoch": 0.2472, "grad_norm": 2.046875, "grad_norm_var": 0.020645904541015624, "learning_rate": 0.0001, "loss": 4.16, "loss/crossentropy": 2.297884225845337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155354768037796, "step": 12360 }, { "epoch": 0.24724, "grad_norm": 2.328125, "grad_norm_var": 0.023273722330729166, "learning_rate": 0.0001, "loss": 4.2181, "loss/crossentropy": 2.121878147125244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22192668169736862, "step": 12362 }, { "epoch": 0.24728, "grad_norm": 2.125, "grad_norm_var": 0.020442454020182292, "learning_rate": 0.0001, "loss": 4.3233, "loss/crossentropy": 2.130228877067566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20712029188871384, "step": 12364 }, { "epoch": 0.24732, "grad_norm": 1.9921875, "grad_norm_var": 0.02367121378580729, "learning_rate": 0.0001, "loss": 3.8542, "loss/crossentropy": 1.6520383954048157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16511157900094986, "step": 12366 }, { "epoch": 0.24736, "grad_norm": 2.1875, "grad_norm_var": 0.02220637003580729, "learning_rate": 0.0001, "loss": 4.1894, "loss/crossentropy": 1.9993118047714233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112603336572647, "step": 12368 }, { "epoch": 0.2474, "grad_norm": 2.046875, "grad_norm_var": 0.011494700113932292, "learning_rate": 0.0001, "loss": 4.0522, "loss/crossentropy": 1.9502257108688354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944574937224388, "step": 12370 }, { "epoch": 0.24744, "grad_norm": 1.9765625, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 4.1098, "loss/crossentropy": 2.1219520568847656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22872696816921234, "step": 12372 }, { "epoch": 0.24748, "grad_norm": 1.984375, "grad_norm_var": 0.0502593994140625, "learning_rate": 0.0001, "loss": 4.172, "loss/crossentropy": 2.0668599605560303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21508124470710754, "step": 12374 }, { "epoch": 0.24752, "grad_norm": 1.984375, "grad_norm_var": 0.05098241170247396, "learning_rate": 0.0001, "loss": 4.1901, "loss/crossentropy": 2.242877721786499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308686003088951, "step": 12376 }, { "epoch": 0.24756, "grad_norm": 2.015625, "grad_norm_var": 0.047304026285807294, "learning_rate": 0.0001, "loss": 3.8779, "loss/crossentropy": 1.8258161544799805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1852153167128563, "step": 12378 }, { "epoch": 0.2476, "grad_norm": 2.046875, "grad_norm_var": 0.04918390909830729, "learning_rate": 0.0001, "loss": 4.1241, "loss/crossentropy": 2.0654167532920837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20526036620140076, "step": 12380 }, { "epoch": 0.24764, "grad_norm": 2.046875, "grad_norm_var": 0.04582087198893229, "learning_rate": 0.0001, "loss": 4.1601, "loss/crossentropy": 2.125216484069824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216343455016613, "step": 12382 }, { "epoch": 0.24768, "grad_norm": 1.9453125, "grad_norm_var": 0.0473052978515625, "learning_rate": 0.0001, "loss": 4.1756, "loss/crossentropy": 2.285245180130005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22941745072603226, "step": 12384 }, { "epoch": 0.24772, "grad_norm": 2.046875, "grad_norm_var": 0.04720637003580729, "learning_rate": 0.0001, "loss": 4.1097, "loss/crossentropy": 2.0148350596427917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1874203458428383, "step": 12386 }, { "epoch": 0.24776, "grad_norm": 2.03125, "grad_norm_var": 0.04812825520833333, "learning_rate": 0.0001, "loss": 4.3127, "loss/crossentropy": 2.2349741458892822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23187313228845596, "step": 12388 }, { "epoch": 0.2478, "grad_norm": 2.078125, "grad_norm_var": 0.009178670247395833, "learning_rate": 0.0001, "loss": 4.1875, "loss/crossentropy": 2.195721983909607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119324654340744, "step": 12390 }, { "epoch": 0.24784, "grad_norm": 2.265625, "grad_norm_var": 0.012676747639973958, "learning_rate": 0.0001, "loss": 4.5732, "loss/crossentropy": 2.1261669397354126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21143332868814468, "step": 12392 }, { "epoch": 0.24788, "grad_norm": 1.9453125, "grad_norm_var": 0.012798817952473958, "learning_rate": 0.0001, "loss": 3.9475, "loss/crossentropy": 2.1640073657035828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192412167787552, "step": 12394 }, { "epoch": 0.24792, "grad_norm": 1.953125, "grad_norm_var": 0.011230214436848959, "learning_rate": 0.0001, "loss": 4.2435, "loss/crossentropy": 1.9555792808532715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110436111688614, "step": 12396 }, { "epoch": 0.24796, "grad_norm": 2.015625, "grad_norm_var": 0.011156972249348958, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 1.948053002357483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19957780838012695, "step": 12398 }, { "epoch": 0.248, "grad_norm": 1.9453125, "grad_norm_var": 0.011156972249348958, "learning_rate": 0.0001, "loss": 4.0865, "loss/crossentropy": 2.355304718017578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20301833003759384, "step": 12400 }, { "epoch": 0.24804, "grad_norm": 2.0, "grad_norm_var": 0.010944620768229166, "learning_rate": 0.0001, "loss": 4.2537, "loss/crossentropy": 2.172769784927368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157917320728302, "step": 12402 }, { "epoch": 0.24808, "grad_norm": 1.8671875, "grad_norm_var": 0.012109120686848959, "learning_rate": 0.0001, "loss": 4.2817, "loss/crossentropy": 2.2186524868011475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.221016563475132, "step": 12404 }, { "epoch": 0.24812, "grad_norm": 1.8828125, "grad_norm_var": 0.012113189697265625, "learning_rate": 0.0001, "loss": 3.9624, "loss/crossentropy": 2.011409044265747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19150879979133606, "step": 12406 }, { "epoch": 0.24816, "grad_norm": 2.125, "grad_norm_var": 0.010984039306640625, "learning_rate": 0.0001, "loss": 4.5039, "loss/crossentropy": 1.9245591163635254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2296570986509323, "step": 12408 }, { "epoch": 0.2482, "grad_norm": 2.328125, "grad_norm_var": 0.016434733072916666, "learning_rate": 0.0001, "loss": 4.5765, "loss/crossentropy": 1.973130702972412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20532477647066116, "step": 12410 }, { "epoch": 0.24824, "grad_norm": 2.03125, "grad_norm_var": 0.01617609659830729, "learning_rate": 0.0001, "loss": 3.914, "loss/crossentropy": 1.655932605266571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1609746441245079, "step": 12412 }, { "epoch": 0.24828, "grad_norm": 2.40625, "grad_norm_var": 0.02411677042643229, "learning_rate": 0.0001, "loss": 4.2172, "loss/crossentropy": 2.104023277759552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19898276031017303, "step": 12414 }, { "epoch": 0.24832, "grad_norm": 1.9921875, "grad_norm_var": 0.02210261027018229, "learning_rate": 0.0001, "loss": 4.0597, "loss/crossentropy": 1.9362882375717163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20107803493738174, "step": 12416 }, { "epoch": 0.24836, "grad_norm": 2.046875, "grad_norm_var": 0.021996815999348957, "learning_rate": 0.0001, "loss": 4.4181, "loss/crossentropy": 2.1508368253707886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2056911736726761, "step": 12418 }, { "epoch": 0.2484, "grad_norm": 1.921875, "grad_norm_var": 0.02194391886393229, "learning_rate": 0.0001, "loss": 3.802, "loss/crossentropy": 1.7712991833686829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1908227875828743, "step": 12420 }, { "epoch": 0.24844, "grad_norm": 2.140625, "grad_norm_var": 0.02060114542643229, "learning_rate": 0.0001, "loss": 4.3557, "loss/crossentropy": 2.2705806493759155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23912303894758224, "step": 12422 }, { "epoch": 0.24848, "grad_norm": 1.9765625, "grad_norm_var": 0.019791666666666666, "learning_rate": 0.0001, "loss": 4.2696, "loss/crossentropy": 2.4222676753997803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21739919483661652, "step": 12424 }, { "epoch": 0.24852, "grad_norm": 2.03125, "grad_norm_var": 0.0145172119140625, "learning_rate": 0.0001, "loss": 4.1935, "loss/crossentropy": 1.945872962474823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19402413070201874, "step": 12426 }, { "epoch": 0.24856, "grad_norm": 1.8671875, "grad_norm_var": 0.01646728515625, "learning_rate": 0.0001, "loss": 4.0908, "loss/crossentropy": 1.7793864011764526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19452669471502304, "step": 12428 }, { "epoch": 0.2486, "grad_norm": 2.0, "grad_norm_var": 0.015941365559895834, "learning_rate": 0.0001, "loss": 4.1045, "loss/crossentropy": 2.2827813625335693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21572840213775635, "step": 12430 }, { "epoch": 0.24864, "grad_norm": 2.03125, "grad_norm_var": 0.016544342041015625, "learning_rate": 0.0001, "loss": 3.749, "loss/crossentropy": 1.9543398022651672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20825360715389252, "step": 12432 }, { "epoch": 0.24868, "grad_norm": 2.046875, "grad_norm_var": 0.017276763916015625, "learning_rate": 0.0001, "loss": 4.3727, "loss/crossentropy": 2.223303198814392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21834088116884232, "step": 12434 }, { "epoch": 0.24872, "grad_norm": 1.9140625, "grad_norm_var": 0.017071278889973958, "learning_rate": 0.0001, "loss": 3.9993, "loss/crossentropy": 1.6987267136573792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1834847405552864, "step": 12436 }, { "epoch": 0.24876, "grad_norm": 2.125, "grad_norm_var": 0.014522043863932292, "learning_rate": 0.0001, "loss": 4.4309, "loss/crossentropy": 2.3947317600250244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22872642427682877, "step": 12438 }, { "epoch": 0.2488, "grad_norm": 2.015625, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 4.4858, "loss/crossentropy": 2.5481021404266357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23773372173309326, "step": 12440 }, { "epoch": 0.24884, "grad_norm": 2.015625, "grad_norm_var": 0.0145904541015625, "learning_rate": 0.0001, "loss": 4.1621, "loss/crossentropy": 2.2349241971969604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21584660559892654, "step": 12442 }, { "epoch": 0.24888, "grad_norm": 2.140625, "grad_norm_var": 0.012988026936848958, "learning_rate": 0.0001, "loss": 4.2827, "loss/crossentropy": 1.9308242201805115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19295598566532135, "step": 12444 }, { "epoch": 0.24892, "grad_norm": 1.8515625, "grad_norm_var": 0.008536783854166667, "learning_rate": 0.0001, "loss": 4.0778, "loss/crossentropy": 2.3508042097091675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22008787840604782, "step": 12446 }, { "epoch": 0.24896, "grad_norm": 2.109375, "grad_norm_var": 0.012593332926432292, "learning_rate": 0.0001, "loss": 4.0865, "loss/crossentropy": 2.0870128870010376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20749760419130325, "step": 12448 }, { "epoch": 0.249, "grad_norm": 2.109375, "grad_norm_var": 0.012971750895182292, "learning_rate": 0.0001, "loss": 4.2758, "loss/crossentropy": 2.087821125984192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21110248565673828, "step": 12450 }, { "epoch": 0.24904, "grad_norm": 2.078125, "grad_norm_var": 0.011494954427083334, "learning_rate": 0.0001, "loss": 4.3075, "loss/crossentropy": 2.131770372390747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21426154673099518, "step": 12452 }, { "epoch": 0.24908, "grad_norm": 2.046875, "grad_norm_var": 0.011092122395833333, "learning_rate": 0.0001, "loss": 4.2373, "loss/crossentropy": 2.175555467605591, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20418058335781097, "step": 12454 }, { "epoch": 0.24912, "grad_norm": 2.1875, "grad_norm_var": 0.012198893229166667, "learning_rate": 0.0001, "loss": 4.2788, "loss/crossentropy": 2.205165147781372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23265192657709122, "step": 12456 }, { "epoch": 0.24916, "grad_norm": 2.046875, "grad_norm_var": 0.014534250895182291, "learning_rate": 0.0001, "loss": 3.9345, "loss/crossentropy": 1.6410180926322937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18058068305253983, "step": 12458 }, { "epoch": 0.2492, "grad_norm": 1.9375, "grad_norm_var": 0.014345041910807292, "learning_rate": 0.0001, "loss": 3.962, "loss/crossentropy": 1.8249012231826782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19164791703224182, "step": 12460 }, { "epoch": 0.24924, "grad_norm": 2.234375, "grad_norm_var": 0.0142242431640625, "learning_rate": 0.0001, "loss": 4.4945, "loss/crossentropy": 2.0906582474708557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23415963351726532, "step": 12462 }, { "epoch": 0.24928, "grad_norm": 2.078125, "grad_norm_var": 0.009285227457682291, "learning_rate": 0.0001, "loss": 4.4702, "loss/crossentropy": 2.314555048942566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24343416839838028, "step": 12464 }, { "epoch": 0.24932, "grad_norm": 1.9921875, "grad_norm_var": 0.009105428059895834, "learning_rate": 0.0001, "loss": 4.3059, "loss/crossentropy": 2.1258983612060547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21851430088281631, "step": 12466 }, { "epoch": 0.24936, "grad_norm": 2.03125, "grad_norm_var": 0.009325917561848958, "learning_rate": 0.0001, "loss": 4.1642, "loss/crossentropy": 1.977954626083374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20545368641614914, "step": 12468 }, { "epoch": 0.2494, "grad_norm": 1.9609375, "grad_norm_var": 0.011258951822916667, "learning_rate": 0.0001, "loss": 4.2224, "loss/crossentropy": 2.0212838649749756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21720624715089798, "step": 12470 }, { "epoch": 0.24944, "grad_norm": 2.0625, "grad_norm_var": 0.010205078125, "learning_rate": 0.0001, "loss": 4.3526, "loss/crossentropy": 1.964124321937561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19991052150726318, "step": 12472 }, { "epoch": 0.24948, "grad_norm": 2.078125, "grad_norm_var": 0.008567047119140626, "learning_rate": 0.0001, "loss": 4.2633, "loss/crossentropy": 1.9038777947425842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19013714790344238, "step": 12474 }, { "epoch": 0.24952, "grad_norm": 2.171875, "grad_norm_var": 0.008449045817057292, "learning_rate": 0.0001, "loss": 4.4785, "loss/crossentropy": 2.1772372722625732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21470360457897186, "step": 12476 }, { "epoch": 0.24956, "grad_norm": 2.109375, "grad_norm_var": 0.006605784098307292, "learning_rate": 0.0001, "loss": 4.3636, "loss/crossentropy": 2.153541684150696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23221681267023087, "step": 12478 }, { "epoch": 0.2496, "grad_norm": 1.9765625, "grad_norm_var": 0.006668853759765625, "learning_rate": 0.0001, "loss": 4.4578, "loss/crossentropy": 2.421363592147827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20812640339136124, "step": 12480 }, { "epoch": 0.24964, "grad_norm": 2.046875, "grad_norm_var": 0.008188629150390625, "learning_rate": 0.0001, "loss": 4.0628, "loss/crossentropy": 1.8497061133384705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18766893446445465, "step": 12482 }, { "epoch": 0.24968, "grad_norm": 2.0625, "grad_norm_var": 0.008894602457682291, "learning_rate": 0.0001, "loss": 4.1011, "loss/crossentropy": 1.9876007437705994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932743340730667, "step": 12484 }, { "epoch": 0.24972, "grad_norm": 2.0625, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.2722, "loss/crossentropy": 2.4254921674728394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23315788805484772, "step": 12486 }, { "epoch": 0.24976, "grad_norm": 1.96875, "grad_norm_var": 0.0079010009765625, "learning_rate": 0.0001, "loss": 4.1012, "loss/crossentropy": 1.983458697795868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190450206398964, "step": 12488 }, { "epoch": 0.2498, "grad_norm": 2.046875, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 4.0284, "loss/crossentropy": 1.8051987886428833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19166412204504013, "step": 12490 }, { "epoch": 0.24984, "grad_norm": 2.140625, "grad_norm_var": 0.0052398681640625, "learning_rate": 0.0001, "loss": 4.2385, "loss/crossentropy": 2.1552056670188904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053346112370491, "step": 12492 }, { "epoch": 0.24988, "grad_norm": 2.0625, "grad_norm_var": 0.004992421468098958, "learning_rate": 0.0001, "loss": 4.0157, "loss/crossentropy": 1.9644648432731628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17631876468658447, "step": 12494 }, { "epoch": 0.24992, "grad_norm": 1.90625, "grad_norm_var": 0.006089019775390625, "learning_rate": 0.0001, "loss": 4.1626, "loss/crossentropy": 2.027154862880707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21671167761087418, "step": 12496 }, { "epoch": 0.24996, "grad_norm": 2.125, "grad_norm_var": 0.00628662109375, "learning_rate": 0.0001, "loss": 4.2485, "loss/crossentropy": 2.2398791313171387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22507991641759872, "step": 12498 }, { "epoch": 0.25, "grad_norm": 1.9296875, "grad_norm_var": 0.006232706705729166, "learning_rate": 0.0001, "loss": 4.13, "loss/crossentropy": 2.2342909574508667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192724049091339, "step": 12500 }, { "epoch": 0.25004, "grad_norm": 2.0, "grad_norm_var": 0.0054443359375, "learning_rate": 0.0001, "loss": 4.2779, "loss/crossentropy": 1.8621744513511658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882321536540985, "step": 12502 }, { "epoch": 0.25008, "grad_norm": 2.109375, "grad_norm_var": 0.004988606770833333, "learning_rate": 0.0001, "loss": 4.4516, "loss/crossentropy": 2.2012354135513306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128412127494812, "step": 12504 }, { "epoch": 0.25012, "grad_norm": 1.9921875, "grad_norm_var": 0.005873362223307292, "learning_rate": 0.0001, "loss": 4.0122, "loss/crossentropy": 1.7629758715629578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928938776254654, "step": 12506 }, { "epoch": 0.25016, "grad_norm": 2.0625, "grad_norm_var": 0.005730946858723958, "learning_rate": 0.0001, "loss": 4.1563, "loss/crossentropy": 2.076514720916748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20530561357736588, "step": 12508 }, { "epoch": 0.2502, "grad_norm": 1.9453125, "grad_norm_var": 0.006207021077473959, "learning_rate": 0.0001, "loss": 4.2005, "loss/crossentropy": 2.004107654094696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21081873774528503, "step": 12510 }, { "epoch": 0.25024, "grad_norm": 2.234375, "grad_norm_var": 0.008013661702473958, "learning_rate": 0.0001, "loss": 4.3323, "loss/crossentropy": 2.232061505317688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21327269077301025, "step": 12512 }, { "epoch": 0.25028, "grad_norm": 2.0625, "grad_norm_var": 0.0068662007649739586, "learning_rate": 0.0001, "loss": 4.1507, "loss/crossentropy": 2.1506210565567017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20797627419233322, "step": 12514 }, { "epoch": 0.25032, "grad_norm": 2.03125, "grad_norm_var": 0.008207194010416667, "learning_rate": 0.0001, "loss": 4.1999, "loss/crossentropy": 2.084704279899597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25306878983974457, "step": 12516 }, { "epoch": 0.25036, "grad_norm": 2.0625, "grad_norm_var": 0.008137003580729166, "learning_rate": 0.0001, "loss": 4.3286, "loss/crossentropy": 2.000536620616913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20802763104438782, "step": 12518 }, { "epoch": 0.2504, "grad_norm": 1.9140625, "grad_norm_var": 0.0088531494140625, "learning_rate": 0.0001, "loss": 4.1324, "loss/crossentropy": 2.2391778230667114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171899899840355, "step": 12520 }, { "epoch": 0.25044, "grad_norm": 1.9453125, "grad_norm_var": 0.011375935872395833, "learning_rate": 0.0001, "loss": 4.458, "loss/crossentropy": 2.1465210914611816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026364952325821, "step": 12522 }, { "epoch": 0.25048, "grad_norm": 1.9296875, "grad_norm_var": 0.011226145426432292, "learning_rate": 0.0001, "loss": 4.0804, "loss/crossentropy": 1.8129625916481018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17917531728744507, "step": 12524 }, { "epoch": 0.25052, "grad_norm": 2.0, "grad_norm_var": 0.010029856363932292, "learning_rate": 0.0001, "loss": 4.1832, "loss/crossentropy": 1.98322331905365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20269985496997833, "step": 12526 }, { "epoch": 0.25056, "grad_norm": 2.1875, "grad_norm_var": 0.008955637613932291, "learning_rate": 0.0001, "loss": 4.373, "loss/crossentropy": 2.196588397026062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23827942460775375, "step": 12528 }, { "epoch": 0.2506, "grad_norm": 2.09375, "grad_norm_var": 0.009209950764973959, "learning_rate": 0.0001, "loss": 4.3811, "loss/crossentropy": 2.183007001876831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071165144443512, "step": 12530 }, { "epoch": 0.25064, "grad_norm": 1.9140625, "grad_norm_var": 0.0106353759765625, "learning_rate": 0.0001, "loss": 4.4205, "loss/crossentropy": 2.2728021144866943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21004344522953033, "step": 12532 }, { "epoch": 0.25068, "grad_norm": 2.125, "grad_norm_var": 0.0111083984375, "learning_rate": 0.0001, "loss": 4.5478, "loss/crossentropy": 2.280518889427185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22367073595523834, "step": 12534 }, { "epoch": 0.25072, "grad_norm": 2.25, "grad_norm_var": 0.015038045247395833, "learning_rate": 0.0001, "loss": 4.135, "loss/crossentropy": 2.088103711605072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038320079445839, "step": 12536 }, { "epoch": 0.25076, "grad_norm": 2.140625, "grad_norm_var": 0.012837727864583334, "learning_rate": 0.0001, "loss": 4.1798, "loss/crossentropy": 1.646530568599701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18982253968715668, "step": 12538 }, { "epoch": 0.2508, "grad_norm": 2.09375, "grad_norm_var": 0.013622792561848958, "learning_rate": 0.0001, "loss": 4.2246, "loss/crossentropy": 2.1743921041488647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22835461795330048, "step": 12540 }, { "epoch": 0.25084, "grad_norm": 1.9375, "grad_norm_var": 0.0151123046875, "learning_rate": 0.0001, "loss": 4.3138, "loss/crossentropy": 2.2216947078704834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21978579461574554, "step": 12542 }, { "epoch": 0.25088, "grad_norm": 1.9921875, "grad_norm_var": 0.014890289306640625, "learning_rate": 0.0001, "loss": 4.298, "loss/crossentropy": 2.141101062297821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21347828209400177, "step": 12544 }, { "epoch": 0.25092, "grad_norm": 2.078125, "grad_norm_var": 0.014861806233723959, "learning_rate": 0.0001, "loss": 4.1534, "loss/crossentropy": 2.1961969137191772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192479968070984, "step": 12546 }, { "epoch": 0.25096, "grad_norm": 1.9609375, "grad_norm_var": 0.014274088541666667, "learning_rate": 0.0001, "loss": 3.9219, "loss/crossentropy": 1.9632240533828735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19182069599628448, "step": 12548 }, { "epoch": 0.251, "grad_norm": 2.359375, "grad_norm_var": 0.02173639933268229, "learning_rate": 0.0001, "loss": 4.0594, "loss/crossentropy": 1.7821694612503052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24946660548448563, "step": 12550 }, { "epoch": 0.25104, "grad_norm": 2.078125, "grad_norm_var": 0.016290028889973957, "learning_rate": 0.0001, "loss": 4.2423, "loss/crossentropy": 2.261335611343384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23269569873809814, "step": 12552 }, { "epoch": 0.25108, "grad_norm": 1.921875, "grad_norm_var": 0.01762669881184896, "learning_rate": 0.0001, "loss": 3.9858, "loss/crossentropy": 2.2365309596061707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119913324713707, "step": 12554 }, { "epoch": 0.25112, "grad_norm": 2.078125, "grad_norm_var": 0.0176025390625, "learning_rate": 0.0001, "loss": 3.9798, "loss/crossentropy": 2.0685681104660034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19985978305339813, "step": 12556 }, { "epoch": 0.25116, "grad_norm": 2.015625, "grad_norm_var": 0.0144683837890625, "learning_rate": 0.0001, "loss": 4.0231, "loss/crossentropy": 2.0773105025291443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2006489858031273, "step": 12558 }, { "epoch": 0.2512, "grad_norm": 2.09375, "grad_norm_var": 0.01561279296875, "learning_rate": 0.0001, "loss": 4.1948, "loss/crossentropy": 2.1145309805870056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20449287444353104, "step": 12560 }, { "epoch": 0.25124, "grad_norm": 1.828125, "grad_norm_var": 0.017097981770833333, "learning_rate": 0.0001, "loss": 4.0543, "loss/crossentropy": 2.109993577003479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20761344581842422, "step": 12562 }, { "epoch": 0.25128, "grad_norm": 1.9375, "grad_norm_var": 0.016755167643229166, "learning_rate": 0.0001, "loss": 4.1661, "loss/crossentropy": 1.9653338193893433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069985270500183, "step": 12564 }, { "epoch": 0.25132, "grad_norm": 2.015625, "grad_norm_var": 0.007045237223307291, "learning_rate": 0.0001, "loss": 4.101, "loss/crossentropy": 1.7084832191467285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18922459334135056, "step": 12566 }, { "epoch": 0.25136, "grad_norm": 2.046875, "grad_norm_var": 0.006494140625, "learning_rate": 0.0001, "loss": 3.9288, "loss/crossentropy": 1.8007041215896606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122679501771927, "step": 12568 }, { "epoch": 0.2514, "grad_norm": 2.125, "grad_norm_var": 0.007972971598307291, "learning_rate": 0.0001, "loss": 4.3199, "loss/crossentropy": 2.041845440864563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2219957411289215, "step": 12570 }, { "epoch": 0.25144, "grad_norm": 1.8671875, "grad_norm_var": 0.008577219645182292, "learning_rate": 0.0001, "loss": 4.3703, "loss/crossentropy": 2.2668861150741577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117021307349205, "step": 12572 }, { "epoch": 0.25148, "grad_norm": 1.890625, "grad_norm_var": 0.010235341389973958, "learning_rate": 0.0001, "loss": 3.8462, "loss/crossentropy": 1.8246251940727234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822521835565567, "step": 12574 }, { "epoch": 0.25152, "grad_norm": 2.046875, "grad_norm_var": 0.010553995768229166, "learning_rate": 0.0001, "loss": 3.8137, "loss/crossentropy": 2.0739742517471313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2114427089691162, "step": 12576 }, { "epoch": 0.25156, "grad_norm": 2.03125, "grad_norm_var": 0.008906809488932292, "learning_rate": 0.0001, "loss": 3.9261, "loss/crossentropy": 1.6614344120025635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18680214881896973, "step": 12578 }, { "epoch": 0.2516, "grad_norm": 1.9765625, "grad_norm_var": 0.01279296875, "learning_rate": 0.0001, "loss": 4.4042, "loss/crossentropy": 2.5629080533981323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26097629219293594, "step": 12580 }, { "epoch": 0.25164, "grad_norm": 2.015625, "grad_norm_var": 0.01337890625, "learning_rate": 0.0001, "loss": 4.1583, "loss/crossentropy": 2.058123230934143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2273598164319992, "step": 12582 }, { "epoch": 0.25168, "grad_norm": 2.15625, "grad_norm_var": 0.014314524332682292, "learning_rate": 0.0001, "loss": 4.2802, "loss/crossentropy": 1.9959335327148438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1991373971104622, "step": 12584 }, { "epoch": 0.25172, "grad_norm": 2.140625, "grad_norm_var": 0.018293253580729165, "learning_rate": 0.0001, "loss": 3.9794, "loss/crossentropy": 1.8165839314460754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1888013407588005, "step": 12586 }, { "epoch": 0.25176, "grad_norm": 2.03125, "grad_norm_var": 0.015221913655598959, "learning_rate": 0.0001, "loss": 4.0687, "loss/crossentropy": 1.9918802976608276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18641646206378937, "step": 12588 }, { "epoch": 0.2518, "grad_norm": 2.078125, "grad_norm_var": 0.013378651936848958, "learning_rate": 0.0001, "loss": 4.3443, "loss/crossentropy": 2.0256036520004272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19628985226154327, "step": 12590 }, { "epoch": 0.25184, "grad_norm": 2.09375, "grad_norm_var": 0.01236572265625, "learning_rate": 0.0001, "loss": 4.1317, "loss/crossentropy": 1.948347806930542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19971590489149094, "step": 12592 }, { "epoch": 0.25188, "grad_norm": 2.046875, "grad_norm_var": 0.012544759114583333, "learning_rate": 0.0001, "loss": 3.8528, "loss/crossentropy": 2.2025747299194336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2437238171696663, "step": 12594 }, { "epoch": 0.25192, "grad_norm": 2.078125, "grad_norm_var": 0.010872141520182291, "learning_rate": 0.0001, "loss": 4.2102, "loss/crossentropy": 2.3209575414657593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366873100399971, "step": 12596 }, { "epoch": 0.25196, "grad_norm": 1.9921875, "grad_norm_var": 0.010846964518229167, "learning_rate": 0.0001, "loss": 3.8028, "loss/crossentropy": 1.7978705763816833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19252559542655945, "step": 12598 }, { "epoch": 0.252, "grad_norm": 2.015625, "grad_norm_var": 0.010359446207682291, "learning_rate": 0.0001, "loss": 3.9269, "loss/crossentropy": 1.964760184288025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1943562552332878, "step": 12600 }, { "epoch": 0.25204, "grad_norm": 2.0, "grad_norm_var": 0.005399322509765625, "learning_rate": 0.0001, "loss": 4.2205, "loss/crossentropy": 2.1420929431915283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.206790953874588, "step": 12602 }, { "epoch": 0.25208, "grad_norm": 2.234375, "grad_norm_var": 0.008084869384765625, "learning_rate": 0.0001, "loss": 4.5232, "loss/crossentropy": 2.2422659397125244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22513067722320557, "step": 12604 }, { "epoch": 0.25212, "grad_norm": 2.75, "grad_norm_var": 0.04026667277018229, "learning_rate": 0.0001, "loss": 3.8394, "loss/crossentropy": 1.587006688117981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1833646520972252, "step": 12606 }, { "epoch": 0.25216, "grad_norm": 2.09375, "grad_norm_var": 0.03873291015625, "learning_rate": 0.0001, "loss": 4.4461, "loss/crossentropy": 2.426867365837097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2435208261013031, "step": 12608 }, { "epoch": 0.2522, "grad_norm": 2.0, "grad_norm_var": 0.040185546875, "learning_rate": 0.0001, "loss": 4.0039, "loss/crossentropy": 1.9733251333236694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21772438287734985, "step": 12610 }, { "epoch": 0.25224, "grad_norm": 2.84375, "grad_norm_var": 0.0764312744140625, "learning_rate": 0.0001, "loss": 4.2712, "loss/crossentropy": 1.4835429191589355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17013566195964813, "step": 12612 }, { "epoch": 0.25228, "grad_norm": 2.0, "grad_norm_var": 0.07500178019205729, "learning_rate": 0.0001, "loss": 3.8985, "loss/crossentropy": 2.165693759918213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21315700560808182, "step": 12614 }, { "epoch": 0.25232, "grad_norm": 2.09375, "grad_norm_var": 0.07396647135416666, "learning_rate": 0.0001, "loss": 3.9856, "loss/crossentropy": 2.206323266029358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19913922995328903, "step": 12616 }, { "epoch": 0.25236, "grad_norm": 2.015625, "grad_norm_var": 0.07515360514322916, "learning_rate": 0.0001, "loss": 4.0788, "loss/crossentropy": 2.2520995140075684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22997721284627914, "step": 12618 }, { "epoch": 0.2524, "grad_norm": 2.140625, "grad_norm_var": 0.07444559733072917, "learning_rate": 0.0001, "loss": 4.365, "loss/crossentropy": 1.9322227239608765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20600315928459167, "step": 12620 }, { "epoch": 0.25244, "grad_norm": 1.96875, "grad_norm_var": 0.047459920247395836, "learning_rate": 0.0001, "loss": 4.0578, "loss/crossentropy": 2.140220284461975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21072514355182648, "step": 12622 }, { "epoch": 0.25248, "grad_norm": 1.96875, "grad_norm_var": 0.04942118326822917, "learning_rate": 0.0001, "loss": 4.2316, "loss/crossentropy": 2.061431884765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2259845808148384, "step": 12624 }, { "epoch": 0.25252, "grad_norm": 2.125, "grad_norm_var": 0.047055816650390624, "learning_rate": 0.0001, "loss": 4.14, "loss/crossentropy": 2.323551654815674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23168490827083588, "step": 12626 }, { "epoch": 0.25256, "grad_norm": 1.890625, "grad_norm_var": 0.008335113525390625, "learning_rate": 0.0001, "loss": 4.0036, "loss/crossentropy": 1.93999582529068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21139459311962128, "step": 12628 }, { "epoch": 0.2526, "grad_norm": 2.140625, "grad_norm_var": 0.011173248291015625, "learning_rate": 0.0001, "loss": 4.207, "loss/crossentropy": 1.9334313869476318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21537292003631592, "step": 12630 }, { "epoch": 0.25264, "grad_norm": 2.0, "grad_norm_var": 0.010477447509765625, "learning_rate": 0.0001, "loss": 4.1674, "loss/crossentropy": 2.145465135574341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22080931067466736, "step": 12632 }, { "epoch": 0.25268, "grad_norm": 1.9375, "grad_norm_var": 0.011668904622395834, "learning_rate": 0.0001, "loss": 4.0058, "loss/crossentropy": 1.9708059430122375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2007170245051384, "step": 12634 }, { "epoch": 0.25272, "grad_norm": 2.015625, "grad_norm_var": 0.0115631103515625, "learning_rate": 0.0001, "loss": 3.9649, "loss/crossentropy": 1.9522746801376343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20030274242162704, "step": 12636 }, { "epoch": 0.25276, "grad_norm": 2.0625, "grad_norm_var": 0.012035878499348958, "learning_rate": 0.0001, "loss": 4.0416, "loss/crossentropy": 1.856387436389923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19685833156108856, "step": 12638 }, { "epoch": 0.2528, "grad_norm": 2.296875, "grad_norm_var": 0.013787587483723959, "learning_rate": 0.0001, "loss": 4.475, "loss/crossentropy": 1.9449518322944641, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20622673630714417, "step": 12640 }, { "epoch": 0.25284, "grad_norm": 2.25, "grad_norm_var": 0.017533365885416666, "learning_rate": 0.0001, "loss": 4.4683, "loss/crossentropy": 2.4559473991394043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21476459503173828, "step": 12642 }, { "epoch": 0.25288, "grad_norm": 1.9296875, "grad_norm_var": 0.016778310139973957, "learning_rate": 0.0001, "loss": 4.3164, "loss/crossentropy": 2.238897919654846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22538839280605316, "step": 12644 }, { "epoch": 0.25292, "grad_norm": 2.078125, "grad_norm_var": 0.013944498697916667, "learning_rate": 0.0001, "loss": 3.9801, "loss/crossentropy": 1.7506417036056519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18603645265102386, "step": 12646 }, { "epoch": 0.25296, "grad_norm": 1.9609375, "grad_norm_var": 0.014359283447265624, "learning_rate": 0.0001, "loss": 4.1238, "loss/crossentropy": 2.162258505821228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21168682724237442, "step": 12648 }, { "epoch": 0.253, "grad_norm": 2.171875, "grad_norm_var": 0.015154774983723958, "learning_rate": 0.0001, "loss": 4.0441, "loss/crossentropy": 1.9122841954231262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20636393874883652, "step": 12650 }, { "epoch": 0.25304, "grad_norm": 2.125, "grad_norm_var": 0.015187327067057292, "learning_rate": 0.0001, "loss": 4.3566, "loss/crossentropy": 2.1501541137695312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22346170246601105, "step": 12652 }, { "epoch": 0.25308, "grad_norm": 2.015625, "grad_norm_var": 0.01456298828125, "learning_rate": 0.0001, "loss": 4.0914, "loss/crossentropy": 1.797228217124939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17510685324668884, "step": 12654 }, { "epoch": 0.25312, "grad_norm": 1.9765625, "grad_norm_var": 0.010422515869140624, "learning_rate": 0.0001, "loss": 3.9232, "loss/crossentropy": 1.7334046363830566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19618064910173416, "step": 12656 }, { "epoch": 0.25316, "grad_norm": 2.109375, "grad_norm_var": 0.0066912333170572914, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 1.99091237783432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21426931023597717, "step": 12658 }, { "epoch": 0.2532, "grad_norm": 2.09375, "grad_norm_var": 0.0069244384765625, "learning_rate": 0.0001, "loss": 4.5036, "loss/crossentropy": 2.1908310651779175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20360572636127472, "step": 12660 }, { "epoch": 0.25324, "grad_norm": 1.9140625, "grad_norm_var": 0.00787353515625, "learning_rate": 0.0001, "loss": 3.9746, "loss/crossentropy": 1.7925593852996826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18203188478946686, "step": 12662 }, { "epoch": 0.25328, "grad_norm": 2.078125, "grad_norm_var": 0.009006500244140625, "learning_rate": 0.0001, "loss": 3.8955, "loss/crossentropy": 1.9464862942695618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1896686628460884, "step": 12664 }, { "epoch": 0.25332, "grad_norm": 2.046875, "grad_norm_var": 0.006379954020182292, "learning_rate": 0.0001, "loss": 3.9922, "loss/crossentropy": 2.0207581520080566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20436276495456696, "step": 12666 }, { "epoch": 0.25336, "grad_norm": 2.09375, "grad_norm_var": 0.005322011311848959, "learning_rate": 0.0001, "loss": 4.141, "loss/crossentropy": 1.9915068745613098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21265029907226562, "step": 12668 }, { "epoch": 0.2534, "grad_norm": 2.015625, "grad_norm_var": 0.005204010009765625, "learning_rate": 0.0001, "loss": 4.2602, "loss/crossentropy": 1.787190020084381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20195094496011734, "step": 12670 }, { "epoch": 0.25344, "grad_norm": 2.0625, "grad_norm_var": 0.005036417643229167, "learning_rate": 0.0001, "loss": 4.3554, "loss/crossentropy": 2.2403881549835205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2313152700662613, "step": 12672 }, { "epoch": 0.25348, "grad_norm": 1.9921875, "grad_norm_var": 0.00540771484375, "learning_rate": 0.0001, "loss": 3.9895, "loss/crossentropy": 1.6122660636901855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19166506081819534, "step": 12674 }, { "epoch": 0.25352, "grad_norm": 2.171875, "grad_norm_var": 0.005402628580729167, "learning_rate": 0.0001, "loss": 4.1966, "loss/crossentropy": 2.0758888125419617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19904160499572754, "step": 12676 }, { "epoch": 0.25356, "grad_norm": 1.9296875, "grad_norm_var": 0.0064656575520833336, "learning_rate": 0.0001, "loss": 4.2466, "loss/crossentropy": 1.8765565156936646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17384758591651917, "step": 12678 }, { "epoch": 0.2536, "grad_norm": 1.9296875, "grad_norm_var": 0.0054705301920572914, "learning_rate": 0.0001, "loss": 4.0977, "loss/crossentropy": 2.1322853565216064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21987880766391754, "step": 12680 }, { "epoch": 0.25364, "grad_norm": 2.0, "grad_norm_var": 0.006196848551432292, "learning_rate": 0.0001, "loss": 3.9862, "loss/crossentropy": 2.0303893089294434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20912615954875946, "step": 12682 }, { "epoch": 0.25368, "grad_norm": 2.109375, "grad_norm_var": 0.0065826416015625, "learning_rate": 0.0001, "loss": 4.0022, "loss/crossentropy": 2.2314319610595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22904963046312332, "step": 12684 }, { "epoch": 0.25372, "grad_norm": 2.140625, "grad_norm_var": 0.007438151041666666, "learning_rate": 0.0001, "loss": 4.1854, "loss/crossentropy": 1.9751350283622742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20683745294809341, "step": 12686 }, { "epoch": 0.25376, "grad_norm": 2.09375, "grad_norm_var": 0.0088043212890625, "learning_rate": 0.0001, "loss": 4.112, "loss/crossentropy": 1.9198334217071533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19134660065174103, "step": 12688 }, { "epoch": 0.2538, "grad_norm": 1.890625, "grad_norm_var": 0.009214019775390625, "learning_rate": 0.0001, "loss": 3.8783, "loss/crossentropy": 1.945135474205017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20948659628629684, "step": 12690 }, { "epoch": 0.25384, "grad_norm": 1.9140625, "grad_norm_var": 0.0094879150390625, "learning_rate": 0.0001, "loss": 4.1018, "loss/crossentropy": 2.146402955055237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21819791197776794, "step": 12692 }, { "epoch": 0.25388, "grad_norm": 2.09375, "grad_norm_var": 0.007806142171223958, "learning_rate": 0.0001, "loss": 4.3696, "loss/crossentropy": 2.309106230735779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160014659166336, "step": 12694 }, { "epoch": 0.25392, "grad_norm": 1.9765625, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 4.0637, "loss/crossentropy": 2.1726107597351074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20791994035243988, "step": 12696 }, { "epoch": 0.25396, "grad_norm": 2.09375, "grad_norm_var": 0.007370758056640625, "learning_rate": 0.0001, "loss": 4.252, "loss/crossentropy": 2.0905996561050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2083379328250885, "step": 12698 }, { "epoch": 0.254, "grad_norm": 2.078125, "grad_norm_var": 0.008185831705729167, "learning_rate": 0.0001, "loss": 4.197, "loss/crossentropy": 1.9190585017204285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1856430396437645, "step": 12700 }, { "epoch": 0.25404, "grad_norm": 2.125, "grad_norm_var": 0.007877604166666666, "learning_rate": 0.0001, "loss": 4.3906, "loss/crossentropy": 2.2493419647216797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20197859406471252, "step": 12702 }, { "epoch": 0.25408, "grad_norm": 2.171875, "grad_norm_var": 0.007731119791666667, "learning_rate": 0.0001, "loss": 4.1935, "loss/crossentropy": 2.0205613374710083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155362293124199, "step": 12704 }, { "epoch": 0.25412, "grad_norm": 2.078125, "grad_norm_var": 0.006257120768229167, "learning_rate": 0.0001, "loss": 4.188, "loss/crossentropy": 1.8853323459625244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20137012749910355, "step": 12706 }, { "epoch": 0.25416, "grad_norm": 1.9140625, "grad_norm_var": 0.0061431884765625, "learning_rate": 0.0001, "loss": 4.098, "loss/crossentropy": 2.109869122505188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2036309316754341, "step": 12708 }, { "epoch": 0.2542, "grad_norm": 2.078125, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 4.1782, "loss/crossentropy": 2.0598954558372498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22793744504451752, "step": 12710 }, { "epoch": 0.25424, "grad_norm": 2.171875, "grad_norm_var": 0.0055735270182291664, "learning_rate": 0.0001, "loss": 4.2984, "loss/crossentropy": 2.169256567955017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20931441336870193, "step": 12712 }, { "epoch": 0.25428, "grad_norm": 2.09375, "grad_norm_var": 0.005197906494140625, "learning_rate": 0.0001, "loss": 4.2715, "loss/crossentropy": 1.8178632855415344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20344894379377365, "step": 12714 }, { "epoch": 0.25432, "grad_norm": 2.046875, "grad_norm_var": 0.004327138264973958, "learning_rate": 0.0001, "loss": 4.0897, "loss/crossentropy": 1.6468743085861206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16709627211093903, "step": 12716 }, { "epoch": 0.25436, "grad_norm": 2.171875, "grad_norm_var": 0.004748280843098958, "learning_rate": 0.0001, "loss": 4.1702, "loss/crossentropy": 1.8353837728500366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20837673544883728, "step": 12718 }, { "epoch": 0.2544, "grad_norm": 1.9140625, "grad_norm_var": 0.005492146809895833, "learning_rate": 0.0001, "loss": 3.8056, "loss/crossentropy": 1.7881666421890259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18411727994680405, "step": 12720 }, { "epoch": 0.25444, "grad_norm": 2.109375, "grad_norm_var": 0.0065958658854166664, "learning_rate": 0.0001, "loss": 4.5369, "loss/crossentropy": 2.19934618473053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22607161849737167, "step": 12722 }, { "epoch": 0.25448, "grad_norm": 2.328125, "grad_norm_var": 0.008821360270182292, "learning_rate": 0.0001, "loss": 4.5237, "loss/crossentropy": 1.9789779782295227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156950756907463, "step": 12724 }, { "epoch": 0.25452, "grad_norm": 1.921875, "grad_norm_var": 0.010778554280598958, "learning_rate": 0.0001, "loss": 4.1059, "loss/crossentropy": 2.0605525970458984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20622699707746506, "step": 12726 }, { "epoch": 0.25456, "grad_norm": 1.9296875, "grad_norm_var": 0.011336263020833333, "learning_rate": 0.0001, "loss": 4.2862, "loss/crossentropy": 2.1131407022476196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21070127934217453, "step": 12728 }, { "epoch": 0.2546, "grad_norm": 2.09375, "grad_norm_var": 0.011336263020833333, "learning_rate": 0.0001, "loss": 4.3789, "loss/crossentropy": 1.9708096981048584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22219926118850708, "step": 12730 }, { "epoch": 0.25464, "grad_norm": 2.171875, "grad_norm_var": 0.011918131510416667, "learning_rate": 0.0001, "loss": 4.2985, "loss/crossentropy": 2.220608353614807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2271113097667694, "step": 12732 }, { "epoch": 0.25468, "grad_norm": 2.046875, "grad_norm_var": 0.01146240234375, "learning_rate": 0.0001, "loss": 4.2097, "loss/crossentropy": 2.535509705543518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24400310218334198, "step": 12734 }, { "epoch": 0.25472, "grad_norm": 2.390625, "grad_norm_var": 0.015952301025390626, "learning_rate": 0.0001, "loss": 4.6837, "loss/crossentropy": 2.3619974851608276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268039956688881, "step": 12736 }, { "epoch": 0.25476, "grad_norm": 1.9453125, "grad_norm_var": 0.0168212890625, "learning_rate": 0.0001, "loss": 3.9929, "loss/crossentropy": 2.0095953941345215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20850248634815216, "step": 12738 }, { "epoch": 0.2548, "grad_norm": 2.03125, "grad_norm_var": 0.012276204427083333, "learning_rate": 0.0001, "loss": 4.1466, "loss/crossentropy": 1.934277892112732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116999626159668, "step": 12740 }, { "epoch": 0.25484, "grad_norm": 2.03125, "grad_norm_var": 0.011701456705729167, "learning_rate": 0.0001, "loss": 4.336, "loss/crossentropy": 2.237201452255249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22638197988271713, "step": 12742 }, { "epoch": 0.25488, "grad_norm": 2.0, "grad_norm_var": 0.011464182535807292, "learning_rate": 0.0001, "loss": 4.0354, "loss/crossentropy": 2.302059292793274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22285569459199905, "step": 12744 }, { "epoch": 0.25492, "grad_norm": 1.8671875, "grad_norm_var": 0.013944244384765625, "learning_rate": 0.0001, "loss": 3.7097, "loss/crossentropy": 1.7690886855125427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19209770113229752, "step": 12746 }, { "epoch": 0.25496, "grad_norm": 2.171875, "grad_norm_var": 0.013944244384765625, "learning_rate": 0.0001, "loss": 4.2324, "loss/crossentropy": 2.4372475147247314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192872166633606, "step": 12748 }, { "epoch": 0.255, "grad_norm": 1.8984375, "grad_norm_var": 0.016405232747395835, "learning_rate": 0.0001, "loss": 4.2627, "loss/crossentropy": 2.12698757648468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21698793768882751, "step": 12750 }, { "epoch": 0.25504, "grad_norm": 2.078125, "grad_norm_var": 0.007710520426432292, "learning_rate": 0.0001, "loss": 3.9991, "loss/crossentropy": 1.989953875541687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969183310866356, "step": 12752 }, { "epoch": 0.25508, "grad_norm": 2.671875, "grad_norm_var": 0.034501139322916666, "learning_rate": 0.0001, "loss": 4.5505, "loss/crossentropy": 2.2911970615386963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21863195300102234, "step": 12754 }, { "epoch": 0.25512, "grad_norm": 2.09375, "grad_norm_var": 0.03460286458333333, "learning_rate": 0.0001, "loss": 4.0915, "loss/crossentropy": 1.8447460532188416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20167672634124756, "step": 12756 }, { "epoch": 0.25516, "grad_norm": 1.96875, "grad_norm_var": 0.035471343994140626, "learning_rate": 0.0001, "loss": 3.8488, "loss/crossentropy": 2.043856382369995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075670212507248, "step": 12758 }, { "epoch": 0.2552, "grad_norm": 2.15625, "grad_norm_var": 0.037536366780598955, "learning_rate": 0.0001, "loss": 4.0893, "loss/crossentropy": 1.735123872756958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19347237050533295, "step": 12760 }, { "epoch": 0.25524, "grad_norm": 2.015625, "grad_norm_var": 0.034407552083333334, "learning_rate": 0.0001, "loss": 4.0586, "loss/crossentropy": 2.1058340072631836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21859320253133774, "step": 12762 }, { "epoch": 0.25528, "grad_norm": 2.171875, "grad_norm_var": 0.03444722493489583, "learning_rate": 0.0001, "loss": 4.2518, "loss/crossentropy": 2.0583395957946777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224592886865139, "step": 12764 }, { "epoch": 0.25532, "grad_norm": 1.9921875, "grad_norm_var": 0.032206217447916664, "learning_rate": 0.0001, "loss": 4.3562, "loss/crossentropy": 2.2550116777420044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23644013702869415, "step": 12766 }, { "epoch": 0.25536, "grad_norm": 2.015625, "grad_norm_var": 0.032293446858723956, "learning_rate": 0.0001, "loss": 4.2338, "loss/crossentropy": 2.463193655014038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2229352444410324, "step": 12768 }, { "epoch": 0.2554, "grad_norm": 2.0, "grad_norm_var": 0.006400299072265625, "learning_rate": 0.0001, "loss": 4.016, "loss/crossentropy": 2.010310709476471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20725534856319427, "step": 12770 }, { "epoch": 0.25544, "grad_norm": 1.921875, "grad_norm_var": 0.006219228108723958, "learning_rate": 0.0001, "loss": 4.1988, "loss/crossentropy": 2.2002042531967163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21686428785324097, "step": 12772 }, { "epoch": 0.25548, "grad_norm": 1.9921875, "grad_norm_var": 0.009690093994140624, "learning_rate": 0.0001, "loss": 4.2974, "loss/crossentropy": 2.072916865348816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2210928201675415, "step": 12774 }, { "epoch": 0.25552, "grad_norm": 2.015625, "grad_norm_var": 0.010149892171223958, "learning_rate": 0.0001, "loss": 4.3168, "loss/crossentropy": 2.068341016769409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20776809751987457, "step": 12776 }, { "epoch": 0.25556, "grad_norm": 1.9296875, "grad_norm_var": 0.010949452718098959, "learning_rate": 0.0001, "loss": 3.8552, "loss/crossentropy": 1.6145030856132507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2056947946548462, "step": 12778 }, { "epoch": 0.2556, "grad_norm": 2.03125, "grad_norm_var": 0.011311848958333334, "learning_rate": 0.0001, "loss": 4.0132, "loss/crossentropy": 2.1529496908187866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21802741289138794, "step": 12780 }, { "epoch": 0.25564, "grad_norm": 2.03125, "grad_norm_var": 0.011226399739583334, "learning_rate": 0.0001, "loss": 4.1708, "loss/crossentropy": 2.255640387535095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21794818341732025, "step": 12782 }, { "epoch": 0.25568, "grad_norm": 2.03125, "grad_norm_var": 0.011546834309895834, "learning_rate": 0.0001, "loss": 4.1102, "loss/crossentropy": 1.9934669137001038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21103205531835556, "step": 12784 }, { "epoch": 0.25572, "grad_norm": 2.125, "grad_norm_var": 0.011864217122395833, "learning_rate": 0.0001, "loss": 4.2924, "loss/crossentropy": 1.9404807090759277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22771108150482178, "step": 12786 }, { "epoch": 0.25576, "grad_norm": 2.078125, "grad_norm_var": 0.012084706624348959, "learning_rate": 0.0001, "loss": 4.0444, "loss/crossentropy": 1.7803818583488464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19245180487632751, "step": 12788 }, { "epoch": 0.2558, "grad_norm": 2.078125, "grad_norm_var": 0.009423828125, "learning_rate": 0.0001, "loss": 4.2033, "loss/crossentropy": 2.108216881752014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239757925271988, "step": 12790 }, { "epoch": 0.25584, "grad_norm": 2.0625, "grad_norm_var": 0.0059478759765625, "learning_rate": 0.0001, "loss": 4.1999, "loss/crossentropy": 2.056196451187134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20573420077562332, "step": 12792 }, { "epoch": 0.25588, "grad_norm": 1.9296875, "grad_norm_var": 0.007275390625, "learning_rate": 0.0001, "loss": 3.9749, "loss/crossentropy": 2.2215099334716797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21107079833745956, "step": 12794 }, { "epoch": 0.25592, "grad_norm": 1.796875, "grad_norm_var": 0.009281158447265625, "learning_rate": 0.0001, "loss": 3.9527, "loss/crossentropy": 1.947695553302765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19738437235355377, "step": 12796 }, { "epoch": 0.25596, "grad_norm": 2.0625, "grad_norm_var": 0.010204823811848958, "learning_rate": 0.0001, "loss": 4.1072, "loss/crossentropy": 2.3351560831069946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21714143455028534, "step": 12798 }, { "epoch": 0.256, "grad_norm": 2.0625, "grad_norm_var": 0.009798177083333333, "learning_rate": 0.0001, "loss": 4.3055, "loss/crossentropy": 2.399898648262024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25758983194828033, "step": 12800 }, { "epoch": 0.25604, "grad_norm": 2.671875, "grad_norm_var": 0.03814697265625, "learning_rate": 0.0001, "loss": 4.5632, "loss/crossentropy": 2.3537880182266235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2976065129041672, "step": 12802 }, { "epoch": 0.25608, "grad_norm": 1.921875, "grad_norm_var": 0.03857421875, "learning_rate": 0.0001, "loss": 3.8394, "loss/crossentropy": 1.7765586972236633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1823679357767105, "step": 12804 }, { "epoch": 0.25612, "grad_norm": 2.1875, "grad_norm_var": 0.039793904622395834, "learning_rate": 0.0001, "loss": 4.4023, "loss/crossentropy": 2.119332432746887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23052836954593658, "step": 12806 }, { "epoch": 0.25616, "grad_norm": 2.109375, "grad_norm_var": 0.0414947509765625, "learning_rate": 0.0001, "loss": 4.0251, "loss/crossentropy": 1.8325288891792297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19811426103115082, "step": 12808 }, { "epoch": 0.2562, "grad_norm": 1.953125, "grad_norm_var": 0.039896392822265626, "learning_rate": 0.0001, "loss": 4.1089, "loss/crossentropy": 2.079294800758362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21122989803552628, "step": 12810 }, { "epoch": 0.25624, "grad_norm": 2.078125, "grad_norm_var": 0.03581110636393229, "learning_rate": 0.0001, "loss": 4.0105, "loss/crossentropy": 2.029142141342163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008090615272522, "step": 12812 }, { "epoch": 0.25628, "grad_norm": 1.9453125, "grad_norm_var": 0.03578058878580729, "learning_rate": 0.0001, "loss": 4.1743, "loss/crossentropy": 1.8697097301483154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19589021801948547, "step": 12814 }, { "epoch": 0.25632, "grad_norm": 1.8203125, "grad_norm_var": 0.038331858317057294, "learning_rate": 0.0001, "loss": 3.8837, "loss/crossentropy": 1.8488793969154358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19999422132968903, "step": 12816 }, { "epoch": 0.25636, "grad_norm": 2.109375, "grad_norm_var": 0.011946360270182291, "learning_rate": 0.0001, "loss": 4.236, "loss/crossentropy": 2.352132201194763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23569129407405853, "step": 12818 }, { "epoch": 0.2564, "grad_norm": 2.015625, "grad_norm_var": 0.010155232747395833, "learning_rate": 0.0001, "loss": 4.2247, "loss/crossentropy": 2.213895559310913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19782276451587677, "step": 12820 }, { "epoch": 0.25644, "grad_norm": 1.96875, "grad_norm_var": 0.008876291910807292, "learning_rate": 0.0001, "loss": 4.205, "loss/crossentropy": 2.328965425491333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138378769159317, "step": 12822 }, { "epoch": 0.25648, "grad_norm": 1.9375, "grad_norm_var": 0.007407379150390625, "learning_rate": 0.0001, "loss": 3.9178, "loss/crossentropy": 2.148880124092102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20276722311973572, "step": 12824 }, { "epoch": 0.25652, "grad_norm": 2.109375, "grad_norm_var": 0.007124582926432292, "learning_rate": 0.0001, "loss": 4.2742, "loss/crossentropy": 2.1533660888671875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22211932390928268, "step": 12826 }, { "epoch": 0.25656, "grad_norm": 2.75, "grad_norm_var": 0.04098078409830729, "learning_rate": 0.0001, "loss": 4.0928, "loss/crossentropy": 1.7518101930618286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17689460515975952, "step": 12828 }, { "epoch": 0.2566, "grad_norm": 2.109375, "grad_norm_var": 0.0395660400390625, "learning_rate": 0.0001, "loss": 3.9745, "loss/crossentropy": 2.346145749092102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22157911956310272, "step": 12830 }, { "epoch": 0.25664, "grad_norm": 2.09375, "grad_norm_var": 0.0358062744140625, "learning_rate": 0.0001, "loss": 4.1689, "loss/crossentropy": 1.8355774283409119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20288754999637604, "step": 12832 }, { "epoch": 0.25668, "grad_norm": 1.984375, "grad_norm_var": 0.03906962076822917, "learning_rate": 0.0001, "loss": 3.9947, "loss/crossentropy": 1.949233889579773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19860410690307617, "step": 12834 }, { "epoch": 0.25672, "grad_norm": 1.9609375, "grad_norm_var": 0.04248046875, "learning_rate": 0.0001, "loss": 4.0903, "loss/crossentropy": 2.0141645669937134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19531898200511932, "step": 12836 }, { "epoch": 0.25676, "grad_norm": 2.015625, "grad_norm_var": 0.04277725219726562, "learning_rate": 0.0001, "loss": 3.918, "loss/crossentropy": 1.6573863625526428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.163545623421669, "step": 12838 }, { "epoch": 0.2568, "grad_norm": 2.15625, "grad_norm_var": 0.04273656209309896, "learning_rate": 0.0001, "loss": 4.3252, "loss/crossentropy": 2.2174651622772217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23083333671092987, "step": 12840 }, { "epoch": 0.25684, "grad_norm": 1.953125, "grad_norm_var": 0.045967356363932295, "learning_rate": 0.0001, "loss": 3.7593, "loss/crossentropy": 1.8067168593406677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963481903076172, "step": 12842 }, { "epoch": 0.25688, "grad_norm": 2.078125, "grad_norm_var": 0.010536448160807291, "learning_rate": 0.0001, "loss": 3.936, "loss/crossentropy": 1.928274691104889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21783485263586044, "step": 12844 }, { "epoch": 0.25692, "grad_norm": 1.90625, "grad_norm_var": 0.013498687744140625, "learning_rate": 0.0001, "loss": 4.0984, "loss/crossentropy": 2.183669090270996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040836587548256, "step": 12846 }, { "epoch": 0.25696, "grad_norm": 2.015625, "grad_norm_var": 0.012894439697265624, "learning_rate": 0.0001, "loss": 4.0624, "loss/crossentropy": 1.7395422458648682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19434035569429398, "step": 12848 }, { "epoch": 0.257, "grad_norm": 1.8515625, "grad_norm_var": 0.012772623697916667, "learning_rate": 0.0001, "loss": 3.9914, "loss/crossentropy": 1.9124428629875183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19825156033039093, "step": 12850 }, { "epoch": 0.25704, "grad_norm": 2.09375, "grad_norm_var": 0.011606597900390625, "learning_rate": 0.0001, "loss": 3.9325, "loss/crossentropy": 1.8064388036727905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19374487549066544, "step": 12852 }, { "epoch": 0.25708, "grad_norm": 2.0, "grad_norm_var": 0.0111724853515625, "learning_rate": 0.0001, "loss": 3.8722, "loss/crossentropy": 2.0943931341171265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20997963845729828, "step": 12854 }, { "epoch": 0.25712, "grad_norm": 1.96875, "grad_norm_var": 0.00948486328125, "learning_rate": 0.0001, "loss": 3.9974, "loss/crossentropy": 1.9252876043319702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20608004182577133, "step": 12856 }, { "epoch": 0.25716, "grad_norm": 2.015625, "grad_norm_var": 0.007575480143229166, "learning_rate": 0.0001, "loss": 3.9536, "loss/crossentropy": 1.813286304473877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2107577547430992, "step": 12858 }, { "epoch": 0.2572, "grad_norm": 1.984375, "grad_norm_var": 0.007405598958333333, "learning_rate": 0.0001, "loss": 4.1571, "loss/crossentropy": 1.985919713973999, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18724270164966583, "step": 12860 }, { "epoch": 0.25724, "grad_norm": 1.921875, "grad_norm_var": 0.004654693603515625, "learning_rate": 0.0001, "loss": 3.9377, "loss/crossentropy": 1.912338137626648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19602636992931366, "step": 12862 }, { "epoch": 0.25728, "grad_norm": 1.9453125, "grad_norm_var": 0.004743448893229167, "learning_rate": 0.0001, "loss": 4.0823, "loss/crossentropy": 1.962310791015625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975855752825737, "step": 12864 }, { "epoch": 0.25732, "grad_norm": 2.140625, "grad_norm_var": 0.0050046284993489586, "learning_rate": 0.0001, "loss": 4.285, "loss/crossentropy": 2.063133656978607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20012972503900528, "step": 12866 }, { "epoch": 0.25736, "grad_norm": 2.15625, "grad_norm_var": 0.005751291910807292, "learning_rate": 0.0001, "loss": 4.1443, "loss/crossentropy": 2.1755728721618652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21089357137680054, "step": 12868 }, { "epoch": 0.2574, "grad_norm": 2.09375, "grad_norm_var": 0.006148274739583333, "learning_rate": 0.0001, "loss": 4.3854, "loss/crossentropy": 2.21665620803833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22525488585233688, "step": 12870 }, { "epoch": 0.25744, "grad_norm": 2.140625, "grad_norm_var": 0.006959788004557292, "learning_rate": 0.0001, "loss": 4.2465, "loss/crossentropy": 2.2826790809631348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22989312559366226, "step": 12872 }, { "epoch": 0.25748, "grad_norm": 1.875, "grad_norm_var": 0.008432769775390625, "learning_rate": 0.0001, "loss": 4.0753, "loss/crossentropy": 2.3187366724014282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22048770636320114, "step": 12874 }, { "epoch": 0.25752, "grad_norm": 2.09375, "grad_norm_var": 0.0093017578125, "learning_rate": 0.0001, "loss": 4.2469, "loss/crossentropy": 2.234878957271576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19875259697437286, "step": 12876 }, { "epoch": 0.25756, "grad_norm": 2.046875, "grad_norm_var": 0.007897694905598959, "learning_rate": 0.0001, "loss": 4.0981, "loss/crossentropy": 1.8919751644134521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20682695508003235, "step": 12878 }, { "epoch": 0.2576, "grad_norm": 1.9453125, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 3.9564, "loss/crossentropy": 2.135833740234375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100691795349121, "step": 12880 }, { "epoch": 0.25764, "grad_norm": 1.96875, "grad_norm_var": 0.008421834309895833, "learning_rate": 0.0001, "loss": 4.163, "loss/crossentropy": 2.043276846408844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20327290892601013, "step": 12882 }, { "epoch": 0.25768, "grad_norm": 1.921875, "grad_norm_var": 0.0077512105305989586, "learning_rate": 0.0001, "loss": 4.2439, "loss/crossentropy": 1.996088445186615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20499806851148605, "step": 12884 }, { "epoch": 0.25772, "grad_norm": 1.984375, "grad_norm_var": 0.006648508707682291, "learning_rate": 0.0001, "loss": 3.9821, "loss/crossentropy": 1.7864345908164978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19354674220085144, "step": 12886 }, { "epoch": 0.25776, "grad_norm": 2.03125, "grad_norm_var": 0.0051310221354166664, "learning_rate": 0.0001, "loss": 3.961, "loss/crossentropy": 1.7890866994857788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1806325614452362, "step": 12888 }, { "epoch": 0.2578, "grad_norm": 2.09375, "grad_norm_var": 0.004938761393229167, "learning_rate": 0.0001, "loss": 4.1836, "loss/crossentropy": 1.9287649989128113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19555142521858215, "step": 12890 }, { "epoch": 0.25784, "grad_norm": 2.140625, "grad_norm_var": 0.005081939697265625, "learning_rate": 0.0001, "loss": 3.9151, "loss/crossentropy": 1.930963397026062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19455663859844208, "step": 12892 }, { "epoch": 0.25788, "grad_norm": 2.109375, "grad_norm_var": 0.007551829020182292, "learning_rate": 0.0001, "loss": 4.3216, "loss/crossentropy": 2.1105018854141235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22564146667718887, "step": 12894 }, { "epoch": 0.25792, "grad_norm": 2.03125, "grad_norm_var": 0.008365885416666666, "learning_rate": 0.0001, "loss": 4.1924, "loss/crossentropy": 2.0535677671432495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2083517387509346, "step": 12896 }, { "epoch": 0.25796, "grad_norm": 2.015625, "grad_norm_var": 0.007966105143229167, "learning_rate": 0.0001, "loss": 4.2951, "loss/crossentropy": 2.062114655971527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20135939121246338, "step": 12898 }, { "epoch": 0.258, "grad_norm": 1.7734375, "grad_norm_var": 0.012482706705729167, "learning_rate": 0.0001, "loss": 3.8187, "loss/crossentropy": 1.968269407749176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20238329470157623, "step": 12900 }, { "epoch": 0.25804, "grad_norm": 2.03125, "grad_norm_var": 0.0598541259765625, "learning_rate": 0.0001, "loss": 4.1899, "loss/crossentropy": 2.0783804655075073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23015306890010834, "step": 12902 }, { "epoch": 0.25808, "grad_norm": 2.140625, "grad_norm_var": 0.05815836588541667, "learning_rate": 0.0001, "loss": 4.1245, "loss/crossentropy": 1.8616145253181458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19802623987197876, "step": 12904 }, { "epoch": 0.25812, "grad_norm": 2.046875, "grad_norm_var": 0.05834147135416667, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.0005027651786804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21621856093406677, "step": 12906 }, { "epoch": 0.25816, "grad_norm": 2.1875, "grad_norm_var": 0.06181208292643229, "learning_rate": 0.0001, "loss": 4.0563, "loss/crossentropy": 1.796087920665741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18594232201576233, "step": 12908 }, { "epoch": 0.2582, "grad_norm": 2.1875, "grad_norm_var": 0.06258316040039062, "learning_rate": 0.0001, "loss": 4.3161, "loss/crossentropy": 2.003196358680725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19296320527791977, "step": 12910 }, { "epoch": 0.25824, "grad_norm": 1.9296875, "grad_norm_var": 0.06313654581705729, "learning_rate": 0.0001, "loss": 4.0489, "loss/crossentropy": 1.8252119421958923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1879737451672554, "step": 12912 }, { "epoch": 0.25828, "grad_norm": 2.046875, "grad_norm_var": 0.0629900614420573, "learning_rate": 0.0001, "loss": 4.2053, "loss/crossentropy": 1.9801989793777466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20867998152971268, "step": 12914 }, { "epoch": 0.25832, "grad_norm": 1.9921875, "grad_norm_var": 0.05583902994791667, "learning_rate": 0.0001, "loss": 3.9754, "loss/crossentropy": 2.04214608669281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21134068816900253, "step": 12916 }, { "epoch": 0.25836, "grad_norm": 2.171875, "grad_norm_var": 0.009159342447916666, "learning_rate": 0.0001, "loss": 4.2202, "loss/crossentropy": 2.0669824481010437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20436694473028183, "step": 12918 }, { "epoch": 0.2584, "grad_norm": 2.09375, "grad_norm_var": 0.008771769205729167, "learning_rate": 0.0001, "loss": 4.3616, "loss/crossentropy": 1.951616883277893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19572293758392334, "step": 12920 }, { "epoch": 0.25844, "grad_norm": 1.953125, "grad_norm_var": 0.010074869791666666, "learning_rate": 0.0001, "loss": 3.8476, "loss/crossentropy": 1.7768954634666443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930573582649231, "step": 12922 }, { "epoch": 0.25848, "grad_norm": 2.40625, "grad_norm_var": 0.020458984375, "learning_rate": 0.0001, "loss": 3.8775, "loss/crossentropy": 1.8527125716209412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18682826310396194, "step": 12924 }, { "epoch": 0.25852, "grad_norm": 2.0, "grad_norm_var": 0.018990071614583333, "learning_rate": 0.0001, "loss": 4.3814, "loss/crossentropy": 2.3325828313827515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21535523980855942, "step": 12926 }, { "epoch": 0.25856, "grad_norm": 1.8984375, "grad_norm_var": 0.02063166300455729, "learning_rate": 0.0001, "loss": 3.7701, "loss/crossentropy": 1.759222686290741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17470692098140717, "step": 12928 }, { "epoch": 0.2586, "grad_norm": 1.921875, "grad_norm_var": 0.021247355143229167, "learning_rate": 0.0001, "loss": 3.8065, "loss/crossentropy": 1.944493055343628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20259656757116318, "step": 12930 }, { "epoch": 0.25864, "grad_norm": 1.984375, "grad_norm_var": 0.05238825480143229, "learning_rate": 0.0001, "loss": 4.133, "loss/crossentropy": 2.086555302143097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180738002061844, "step": 12932 }, { "epoch": 0.25868, "grad_norm": 1.9453125, "grad_norm_var": 0.052779134114583334, "learning_rate": 0.0001, "loss": 4.1414, "loss/crossentropy": 1.9723011255264282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22814874351024628, "step": 12934 }, { "epoch": 0.25872, "grad_norm": 2.125, "grad_norm_var": 0.05356038411458333, "learning_rate": 0.0001, "loss": 4.418, "loss/crossentropy": 1.5646896958351135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18182098120450974, "step": 12936 }, { "epoch": 0.25876, "grad_norm": 1.8828125, "grad_norm_var": 0.054870351155598955, "learning_rate": 0.0001, "loss": 3.8356, "loss/crossentropy": 1.8396940231323242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19390598684549332, "step": 12938 }, { "epoch": 0.2588, "grad_norm": 2.046875, "grad_norm_var": 0.041112263997395836, "learning_rate": 0.0001, "loss": 4.5902, "loss/crossentropy": 2.086738705635071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209588885307312, "step": 12940 }, { "epoch": 0.25884, "grad_norm": 2.078125, "grad_norm_var": 0.04136530558268229, "learning_rate": 0.0001, "loss": 4.2611, "loss/crossentropy": 2.0965282917022705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21021046489477158, "step": 12942 }, { "epoch": 0.25888, "grad_norm": 1.8125, "grad_norm_var": 0.04311421712239583, "learning_rate": 0.0001, "loss": 4.0702, "loss/crossentropy": 2.2878576517105103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2203715667128563, "step": 12944 }, { "epoch": 0.25892, "grad_norm": 1.96875, "grad_norm_var": 0.045446523030598956, "learning_rate": 0.0001, "loss": 4.419, "loss/crossentropy": 2.1753373742103577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21585986018180847, "step": 12946 }, { "epoch": 0.25896, "grad_norm": 2.0625, "grad_norm_var": 0.036232248942057295, "learning_rate": 0.0001, "loss": 4.3735, "loss/crossentropy": 1.963489055633545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20374132692813873, "step": 12948 }, { "epoch": 0.259, "grad_norm": 2.265625, "grad_norm_var": 0.037751261393229166, "learning_rate": 0.0001, "loss": 4.4073, "loss/crossentropy": 2.3734259605407715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2375379502773285, "step": 12950 }, { "epoch": 0.25904, "grad_norm": 1.9921875, "grad_norm_var": 0.040710194905598955, "learning_rate": 0.0001, "loss": 4.184, "loss/crossentropy": 2.266390085220337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22274205088615417, "step": 12952 }, { "epoch": 0.25908, "grad_norm": 2.046875, "grad_norm_var": 0.03729654947916667, "learning_rate": 0.0001, "loss": 3.9735, "loss/crossentropy": 1.9015297293663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20370282977819443, "step": 12954 }, { "epoch": 0.25912, "grad_norm": 2.03125, "grad_norm_var": 0.03886617024739583, "learning_rate": 0.0001, "loss": 4.3269, "loss/crossentropy": 2.1192378997802734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26288172602653503, "step": 12956 }, { "epoch": 0.25916, "grad_norm": 2.046875, "grad_norm_var": 0.03792088826497396, "learning_rate": 0.0001, "loss": 4.0982, "loss/crossentropy": 1.713826835155487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18557386100292206, "step": 12958 }, { "epoch": 0.2592, "grad_norm": 1.9921875, "grad_norm_var": 0.029545084635416666, "learning_rate": 0.0001, "loss": 4.2002, "loss/crossentropy": 2.0365039706230164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21086978912353516, "step": 12960 }, { "epoch": 0.25924, "grad_norm": 2.078125, "grad_norm_var": 0.027852376302083332, "learning_rate": 0.0001, "loss": 4.2422, "loss/crossentropy": 1.6235400438308716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18112218379974365, "step": 12962 }, { "epoch": 0.25928, "grad_norm": 2.03125, "grad_norm_var": 0.0116455078125, "learning_rate": 0.0001, "loss": 4.2996, "loss/crossentropy": 2.042620003223419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21463964879512787, "step": 12964 }, { "epoch": 0.25932, "grad_norm": 2.046875, "grad_norm_var": 0.009611002604166667, "learning_rate": 0.0001, "loss": 4.0222, "loss/crossentropy": 1.6238983273506165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171783745288849, "step": 12966 }, { "epoch": 0.25936, "grad_norm": 2.4375, "grad_norm_var": 0.014070383707682292, "learning_rate": 0.0001, "loss": 4.0098, "loss/crossentropy": 2.3150339126586914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22038520872592926, "step": 12968 }, { "epoch": 0.2594, "grad_norm": 2.09375, "grad_norm_var": 0.013301595052083334, "learning_rate": 0.0001, "loss": 4.2202, "loss/crossentropy": 2.3907227516174316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2269512265920639, "step": 12970 }, { "epoch": 0.25944, "grad_norm": 1.921875, "grad_norm_var": 0.014378865559895834, "learning_rate": 0.0001, "loss": 4.1032, "loss/crossentropy": 2.1069165468215942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21007738262414932, "step": 12972 }, { "epoch": 0.25948, "grad_norm": 1.8984375, "grad_norm_var": 0.017942047119140624, "learning_rate": 0.0001, "loss": 3.9475, "loss/crossentropy": 2.277982234954834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21795185655355453, "step": 12974 }, { "epoch": 0.25952, "grad_norm": 1.9921875, "grad_norm_var": 0.017775217692057293, "learning_rate": 0.0001, "loss": 4.0159, "loss/crossentropy": 1.7227254509925842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19050486385822296, "step": 12976 }, { "epoch": 0.25956, "grad_norm": 1.9453125, "grad_norm_var": 0.0184722900390625, "learning_rate": 0.0001, "loss": 4.0281, "loss/crossentropy": 2.0112340450286865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21032612770795822, "step": 12978 }, { "epoch": 0.2596, "grad_norm": 2.140625, "grad_norm_var": 0.019530232747395834, "learning_rate": 0.0001, "loss": 4.3041, "loss/crossentropy": 2.4241796731948853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22474834322929382, "step": 12980 }, { "epoch": 0.25964, "grad_norm": 2.078125, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 4.3814, "loss/crossentropy": 1.893187701702118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20330534130334854, "step": 12982 }, { "epoch": 0.25968, "grad_norm": 1.9375, "grad_norm_var": 0.008467356363932291, "learning_rate": 0.0001, "loss": 4.1527, "loss/crossentropy": 2.045100212097168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20636773109436035, "step": 12984 }, { "epoch": 0.25972, "grad_norm": 1.8125, "grad_norm_var": 0.010035959879557292, "learning_rate": 0.0001, "loss": 3.864, "loss/crossentropy": 1.819455087184906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1775035709142685, "step": 12986 }, { "epoch": 0.25976, "grad_norm": 1.9609375, "grad_norm_var": 0.00985107421875, "learning_rate": 0.0001, "loss": 4.0553, "loss/crossentropy": 2.1358631253242493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2027900367975235, "step": 12988 }, { "epoch": 0.2598, "grad_norm": 2.1875, "grad_norm_var": 0.010902659098307291, "learning_rate": 0.0001, "loss": 4.1552, "loss/crossentropy": 1.7847145199775696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21203257888555527, "step": 12990 }, { "epoch": 0.25984, "grad_norm": 2.109375, "grad_norm_var": 0.011979166666666667, "learning_rate": 0.0001, "loss": 4.325, "loss/crossentropy": 2.2848687171936035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22337154299020767, "step": 12992 }, { "epoch": 0.25988, "grad_norm": 2.046875, "grad_norm_var": 0.012889607747395834, "learning_rate": 0.0001, "loss": 3.975, "loss/crossentropy": 1.9629716277122498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20660604536533356, "step": 12994 }, { "epoch": 0.25992, "grad_norm": 2.0, "grad_norm_var": 0.010461171468098959, "learning_rate": 0.0001, "loss": 4.1521, "loss/crossentropy": 2.0321004390716553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21755626797676086, "step": 12996 }, { "epoch": 0.25996, "grad_norm": 1.890625, "grad_norm_var": 0.0105712890625, "learning_rate": 0.0001, "loss": 4.1208, "loss/crossentropy": 1.950038492679596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987697035074234, "step": 12998 }, { "epoch": 0.26, "grad_norm": 2.078125, "grad_norm_var": 0.010544586181640624, "learning_rate": 0.0001, "loss": 4.3383, "loss/crossentropy": 2.2752585411071777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20959648489952087, "step": 13000 }, { "epoch": 0.26004, "grad_norm": 2.125, "grad_norm_var": 0.008138020833333334, "learning_rate": 0.0001, "loss": 4.3132, "loss/crossentropy": 2.162104368209839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21731911599636078, "step": 13002 }, { "epoch": 0.26008, "grad_norm": 1.890625, "grad_norm_var": 0.009102121988932291, "learning_rate": 0.0001, "loss": 4.0138, "loss/crossentropy": 2.3739081025123596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22912397980690002, "step": 13004 }, { "epoch": 0.26012, "grad_norm": 3.0625, "grad_norm_var": 0.07608413696289062, "learning_rate": 0.0001, "loss": 4.3052, "loss/crossentropy": 2.161367416381836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22413842380046844, "step": 13006 }, { "epoch": 0.26016, "grad_norm": 2.0625, "grad_norm_var": 0.07587661743164062, "learning_rate": 0.0001, "loss": 4.1435, "loss/crossentropy": 1.9589285850524902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21598373353481293, "step": 13008 }, { "epoch": 0.2602, "grad_norm": 2.09375, "grad_norm_var": 0.0731842041015625, "learning_rate": 0.0001, "loss": 4.3268, "loss/crossentropy": 2.3472602367401123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22626091539859772, "step": 13010 }, { "epoch": 0.26024, "grad_norm": 1.984375, "grad_norm_var": 0.07296727498372396, "learning_rate": 0.0001, "loss": 3.8455, "loss/crossentropy": 1.9968576431274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20777586847543716, "step": 13012 }, { "epoch": 0.26028, "grad_norm": 2.140625, "grad_norm_var": 0.0729400634765625, "learning_rate": 0.0001, "loss": 3.8219, "loss/crossentropy": 1.7597955465316772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19565816968679428, "step": 13014 }, { "epoch": 0.26032, "grad_norm": 2.09375, "grad_norm_var": 0.0735992431640625, "learning_rate": 0.0001, "loss": 4.1446, "loss/crossentropy": 1.8115187287330627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19675451517105103, "step": 13016 }, { "epoch": 0.26036, "grad_norm": 1.9453125, "grad_norm_var": 0.07507909138997396, "learning_rate": 0.0001, "loss": 4.2018, "loss/crossentropy": 1.9728147983551025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19999082386493683, "step": 13018 }, { "epoch": 0.2604, "grad_norm": 1.8828125, "grad_norm_var": 0.07516988118489583, "learning_rate": 0.0001, "loss": 4.1434, "loss/crossentropy": 2.155986785888672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20160435140132904, "step": 13020 }, { "epoch": 0.26044, "grad_norm": 2.015625, "grad_norm_var": 0.0081298828125, "learning_rate": 0.0001, "loss": 4.0647, "loss/crossentropy": 2.119171440601349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2231176346540451, "step": 13022 }, { "epoch": 0.26048, "grad_norm": 1.8046875, "grad_norm_var": 0.010884348551432292, "learning_rate": 0.0001, "loss": 3.8185, "loss/crossentropy": 2.0395994186401367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21075501292943954, "step": 13024 }, { "epoch": 0.26052, "grad_norm": 2.28125, "grad_norm_var": 0.014808909098307291, "learning_rate": 0.0001, "loss": 4.4988, "loss/crossentropy": 2.42897891998291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24882060289382935, "step": 13026 }, { "epoch": 0.26056, "grad_norm": 2.25, "grad_norm_var": 0.018202463785807293, "learning_rate": 0.0001, "loss": 4.4995, "loss/crossentropy": 2.367077350616455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21996428817510605, "step": 13028 }, { "epoch": 0.2606, "grad_norm": 1.875, "grad_norm_var": 0.017207590738932292, "learning_rate": 0.0001, "loss": 4.0349, "loss/crossentropy": 2.071397542953491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20548687875270844, "step": 13030 }, { "epoch": 0.26064, "grad_norm": 2.078125, "grad_norm_var": 0.016943359375, "learning_rate": 0.0001, "loss": 4.0425, "loss/crossentropy": 1.764617681503296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19541263580322266, "step": 13032 }, { "epoch": 0.26068, "grad_norm": 2.015625, "grad_norm_var": 0.016916656494140626, "learning_rate": 0.0001, "loss": 4.2657, "loss/crossentropy": 2.2966678142547607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187207117676735, "step": 13034 }, { "epoch": 0.26072, "grad_norm": 2.09375, "grad_norm_var": 0.0159820556640625, "learning_rate": 0.0001, "loss": 4.2959, "loss/crossentropy": 1.923275649547577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20180264115333557, "step": 13036 }, { "epoch": 0.26076, "grad_norm": 2.171875, "grad_norm_var": 0.017207845052083334, "learning_rate": 0.0001, "loss": 4.3138, "loss/crossentropy": 2.246683716773987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22133737802505493, "step": 13038 }, { "epoch": 0.2608, "grad_norm": 2.046875, "grad_norm_var": 0.011755116780598958, "learning_rate": 0.0001, "loss": 4.3685, "loss/crossentropy": 2.271313190460205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23316439241170883, "step": 13040 }, { "epoch": 0.26084, "grad_norm": 2.0625, "grad_norm_var": 0.008652496337890624, "learning_rate": 0.0001, "loss": 4.4451, "loss/crossentropy": 2.2649654150009155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.211005300283432, "step": 13042 }, { "epoch": 0.26088, "grad_norm": 1.8828125, "grad_norm_var": 0.008006795247395834, "learning_rate": 0.0001, "loss": 3.9505, "loss/crossentropy": 1.7872197031974792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17795034497976303, "step": 13044 }, { "epoch": 0.26092, "grad_norm": 2.046875, "grad_norm_var": 0.007875315348307292, "learning_rate": 0.0001, "loss": 3.8693, "loss/crossentropy": 1.793078064918518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18739672750234604, "step": 13046 }, { "epoch": 0.26096, "grad_norm": 2.140625, "grad_norm_var": 0.0111572265625, "learning_rate": 0.0001, "loss": 4.4178, "loss/crossentropy": 2.0676932334899902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156887650489807, "step": 13048 }, { "epoch": 0.261, "grad_norm": 2.046875, "grad_norm_var": 0.013484700520833334, "learning_rate": 0.0001, "loss": 4.2851, "loss/crossentropy": 2.2497498989105225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21906063705682755, "step": 13050 }, { "epoch": 0.26104, "grad_norm": 2.015625, "grad_norm_var": 0.014994303385416666, "learning_rate": 0.0001, "loss": 4.0203, "loss/crossentropy": 1.901672899723053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2070387825369835, "step": 13052 }, { "epoch": 0.26108, "grad_norm": 2.109375, "grad_norm_var": 0.013887532552083333, "learning_rate": 0.0001, "loss": 4.3772, "loss/crossentropy": 2.2334693670272827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21539244055747986, "step": 13054 }, { "epoch": 0.26112, "grad_norm": 2.0625, "grad_norm_var": 0.014288075764973958, "learning_rate": 0.0001, "loss": 4.2916, "loss/crossentropy": 2.0297399759292603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18854249268770218, "step": 13056 }, { "epoch": 0.26116, "grad_norm": 2.09375, "grad_norm_var": 0.014452107747395833, "learning_rate": 0.0001, "loss": 4.1115, "loss/crossentropy": 2.0964863896369934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21821416169404984, "step": 13058 }, { "epoch": 0.2612, "grad_norm": 2.09375, "grad_norm_var": 0.013702138264973959, "learning_rate": 0.0001, "loss": 4.4918, "loss/crossentropy": 2.1526511907577515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22606930136680603, "step": 13060 }, { "epoch": 0.26124, "grad_norm": 2.109375, "grad_norm_var": 0.0101470947265625, "learning_rate": 0.0001, "loss": 4.2079, "loss/crossentropy": 2.2275509238243103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959830820560455, "step": 13062 }, { "epoch": 0.26128, "grad_norm": 2.046875, "grad_norm_var": 0.0086181640625, "learning_rate": 0.0001, "loss": 4.4246, "loss/crossentropy": 2.542263627052307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22889885306358337, "step": 13064 }, { "epoch": 0.26132, "grad_norm": 1.875, "grad_norm_var": 0.008585611979166666, "learning_rate": 0.0001, "loss": 4.0292, "loss/crossentropy": 1.7915868163108826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20242593437433243, "step": 13066 }, { "epoch": 0.26136, "grad_norm": 2.046875, "grad_norm_var": 0.006453450520833333, "learning_rate": 0.0001, "loss": 4.1415, "loss/crossentropy": 2.183286130428314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.227259561419487, "step": 13068 }, { "epoch": 0.2614, "grad_norm": 2.046875, "grad_norm_var": 0.0063720703125, "learning_rate": 0.0001, "loss": 4.3211, "loss/crossentropy": 2.2707515954971313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2300432324409485, "step": 13070 }, { "epoch": 0.26144, "grad_norm": 1.9765625, "grad_norm_var": 0.0066487630208333336, "learning_rate": 0.0001, "loss": 4.0237, "loss/crossentropy": 1.822485864162445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19380664825439453, "step": 13072 }, { "epoch": 0.26148, "grad_norm": 2.203125, "grad_norm_var": 0.008349355061848958, "learning_rate": 0.0001, "loss": 4.0915, "loss/crossentropy": 2.089439034461975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22531338036060333, "step": 13074 }, { "epoch": 0.26152, "grad_norm": 2.21875, "grad_norm_var": 0.008318837483723958, "learning_rate": 0.0001, "loss": 4.3872, "loss/crossentropy": 2.0921221375465393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22256288677453995, "step": 13076 }, { "epoch": 0.26156, "grad_norm": 2.1875, "grad_norm_var": 0.009065500895182292, "learning_rate": 0.0001, "loss": 4.4644, "loss/crossentropy": 1.8978914022445679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2677152305841446, "step": 13078 }, { "epoch": 0.2616, "grad_norm": 2.296875, "grad_norm_var": 0.011643218994140624, "learning_rate": 0.0001, "loss": 4.405, "loss/crossentropy": 2.2540348768234253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24195441603660583, "step": 13080 }, { "epoch": 0.26164, "grad_norm": 1.9140625, "grad_norm_var": 0.01092529296875, "learning_rate": 0.0001, "loss": 4.0965, "loss/crossentropy": 1.9546288847923279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19464369863271713, "step": 13082 }, { "epoch": 0.26168, "grad_norm": 2.0625, "grad_norm_var": 0.012457021077473958, "learning_rate": 0.0001, "loss": 4.0103, "loss/crossentropy": 2.1527568101882935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22238580137491226, "step": 13084 }, { "epoch": 0.26172, "grad_norm": 1.9765625, "grad_norm_var": 0.0128814697265625, "learning_rate": 0.0001, "loss": 4.3957, "loss/crossentropy": 2.111591637134552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21360620856285095, "step": 13086 }, { "epoch": 0.26176, "grad_norm": 1.953125, "grad_norm_var": 0.01339111328125, "learning_rate": 0.0001, "loss": 3.99, "loss/crossentropy": 2.006071925163269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19746285676956177, "step": 13088 }, { "epoch": 0.2618, "grad_norm": 2.203125, "grad_norm_var": 0.0128814697265625, "learning_rate": 0.0001, "loss": 4.2073, "loss/crossentropy": 2.276778221130371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21222709119319916, "step": 13090 }, { "epoch": 0.26184, "grad_norm": 2.015625, "grad_norm_var": 0.013346354166666666, "learning_rate": 0.0001, "loss": 4.0679, "loss/crossentropy": 2.102404534816742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049730271100998, "step": 13092 }, { "epoch": 0.26188, "grad_norm": 2.015625, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 4.439, "loss/crossentropy": 2.204727053642273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21260947734117508, "step": 13094 }, { "epoch": 0.26192, "grad_norm": 1.9609375, "grad_norm_var": 0.010184478759765626, "learning_rate": 0.0001, "loss": 4.2801, "loss/crossentropy": 2.3762032985687256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22091203182935715, "step": 13096 }, { "epoch": 0.26196, "grad_norm": 1.90625, "grad_norm_var": 0.010286458333333333, "learning_rate": 0.0001, "loss": 4.3472, "loss/crossentropy": 2.0710391998291016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20782632380723953, "step": 13098 }, { "epoch": 0.262, "grad_norm": 2.09375, "grad_norm_var": 0.009319814046223958, "learning_rate": 0.0001, "loss": 4.3257, "loss/crossentropy": 2.193024158477783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21109049022197723, "step": 13100 }, { "epoch": 0.26204, "grad_norm": 2.109375, "grad_norm_var": 0.009308878580729167, "learning_rate": 0.0001, "loss": 4.1863, "loss/crossentropy": 1.9668392539024353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19249404221773148, "step": 13102 }, { "epoch": 0.26208, "grad_norm": 2.015625, "grad_norm_var": 0.00858154296875, "learning_rate": 0.0001, "loss": 4.3122, "loss/crossentropy": 2.2675795555114746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26278799772262573, "step": 13104 }, { "epoch": 0.26212, "grad_norm": 2.109375, "grad_norm_var": 0.007857004801432291, "learning_rate": 0.0001, "loss": 3.9962, "loss/crossentropy": 2.0119330883026123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20821017771959305, "step": 13106 }, { "epoch": 0.26216, "grad_norm": 1.984375, "grad_norm_var": 0.006237538655598959, "learning_rate": 0.0001, "loss": 4.329, "loss/crossentropy": 2.1259734630584717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22454438358545303, "step": 13108 }, { "epoch": 0.2622, "grad_norm": 2.03125, "grad_norm_var": 0.0043413798014322914, "learning_rate": 0.0001, "loss": 4.2558, "loss/crossentropy": 1.8091979622840881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19353152066469193, "step": 13110 }, { "epoch": 0.26224, "grad_norm": 2.0625, "grad_norm_var": 0.0038655598958333335, "learning_rate": 0.0001, "loss": 4.1847, "loss/crossentropy": 1.9463382363319397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20932665467262268, "step": 13112 }, { "epoch": 0.26228, "grad_norm": 2.125, "grad_norm_var": 0.004366048177083333, "learning_rate": 0.0001, "loss": 4.3975, "loss/crossentropy": 2.475601077079773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23451963067054749, "step": 13114 }, { "epoch": 0.26232, "grad_norm": 1.9921875, "grad_norm_var": 0.004412587483723958, "learning_rate": 0.0001, "loss": 4.4397, "loss/crossentropy": 2.0833849906921387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22380392253398895, "step": 13116 }, { "epoch": 0.26236, "grad_norm": 1.8984375, "grad_norm_var": 0.005301920572916666, "learning_rate": 0.0001, "loss": 4.0408, "loss/crossentropy": 1.9876453876495361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20823375135660172, "step": 13118 }, { "epoch": 0.2624, "grad_norm": 2.15625, "grad_norm_var": 0.006750233968098958, "learning_rate": 0.0001, "loss": 4.5321, "loss/crossentropy": 2.184763789176941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24357233941555023, "step": 13120 }, { "epoch": 0.26244, "grad_norm": 2.015625, "grad_norm_var": 0.0061187744140625, "learning_rate": 0.0001, "loss": 4.3552, "loss/crossentropy": 2.217886805534363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21944888681173325, "step": 13122 }, { "epoch": 0.26248, "grad_norm": 2.0625, "grad_norm_var": 0.0052886962890625, "learning_rate": 0.0001, "loss": 4.2443, "loss/crossentropy": 2.3299126625061035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158309668302536, "step": 13124 }, { "epoch": 0.26252, "grad_norm": 2.09375, "grad_norm_var": 0.0058349609375, "learning_rate": 0.0001, "loss": 4.1303, "loss/crossentropy": 2.0423209071159363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20835570991039276, "step": 13126 }, { "epoch": 0.26256, "grad_norm": 1.796875, "grad_norm_var": 0.012325032552083334, "learning_rate": 0.0001, "loss": 4.31, "loss/crossentropy": 2.227811813354492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043633684515953, "step": 13128 }, { "epoch": 0.2626, "grad_norm": 2.078125, "grad_norm_var": 0.011153157552083333, "learning_rate": 0.0001, "loss": 4.1312, "loss/crossentropy": 2.105396568775177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2212589681148529, "step": 13130 }, { "epoch": 0.26264, "grad_norm": 2.234375, "grad_norm_var": 0.012737782796223958, "learning_rate": 0.0001, "loss": 4.2244, "loss/crossentropy": 2.1711814999580383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21528291702270508, "step": 13132 }, { "epoch": 0.26268, "grad_norm": 1.9375, "grad_norm_var": 0.011937459309895834, "learning_rate": 0.0001, "loss": 4.1288, "loss/crossentropy": 2.0086284279823303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974012851715088, "step": 13134 }, { "epoch": 0.26272, "grad_norm": 2.0625, "grad_norm_var": 0.011205037434895834, "learning_rate": 0.0001, "loss": 3.895, "loss/crossentropy": 1.7025322914123535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878160759806633, "step": 13136 }, { "epoch": 0.26276, "grad_norm": 2.21875, "grad_norm_var": 0.015950520833333332, "learning_rate": 0.0001, "loss": 4.0134, "loss/crossentropy": 1.821410596370697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197972871363163, "step": 13138 }, { "epoch": 0.2628, "grad_norm": 1.8671875, "grad_norm_var": 0.018993886311848958, "learning_rate": 0.0001, "loss": 4.0875, "loss/crossentropy": 2.0485053658485413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20401400327682495, "step": 13140 }, { "epoch": 0.26284, "grad_norm": 2.03125, "grad_norm_var": 0.018808746337890626, "learning_rate": 0.0001, "loss": 4.4344, "loss/crossentropy": 2.350088357925415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638252303004265, "step": 13142 }, { "epoch": 0.26288, "grad_norm": 2.078125, "grad_norm_var": 0.012308502197265625, "learning_rate": 0.0001, "loss": 4.5756, "loss/crossentropy": 2.3423168659210205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22735413163900375, "step": 13144 }, { "epoch": 0.26292, "grad_norm": 2.0, "grad_norm_var": 0.012033843994140625, "learning_rate": 0.0001, "loss": 4.3848, "loss/crossentropy": 2.134036064147949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20407382398843765, "step": 13146 }, { "epoch": 0.26296, "grad_norm": 1.9921875, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 4.105, "loss/crossentropy": 2.0744638442993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2107524871826172, "step": 13148 }, { "epoch": 0.263, "grad_norm": 1.9375, "grad_norm_var": 0.012604777018229167, "learning_rate": 0.0001, "loss": 4.0507, "loss/crossentropy": 2.1522003412246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2046833261847496, "step": 13150 }, { "epoch": 0.26304, "grad_norm": 2.140625, "grad_norm_var": 0.013224283854166666, "learning_rate": 0.0001, "loss": 4.2754, "loss/crossentropy": 2.328645348548889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24340055882930756, "step": 13152 }, { "epoch": 0.26308, "grad_norm": 1.9296875, "grad_norm_var": 0.009161122639973958, "learning_rate": 0.0001, "loss": 4.0502, "loss/crossentropy": 2.0878008008003235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110893502831459, "step": 13154 }, { "epoch": 0.26312, "grad_norm": 1.984375, "grad_norm_var": 0.0074155171712239586, "learning_rate": 0.0001, "loss": 4.3065, "loss/crossentropy": 1.9381126761436462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18893951922655106, "step": 13156 }, { "epoch": 0.26316, "grad_norm": 2.21875, "grad_norm_var": 0.009364573160807292, "learning_rate": 0.0001, "loss": 4.0254, "loss/crossentropy": 2.153562545776367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973574310541153, "step": 13158 }, { "epoch": 0.2632, "grad_norm": 2.203125, "grad_norm_var": 0.011370595296223958, "learning_rate": 0.0001, "loss": 3.998, "loss/crossentropy": 2.0583202242851257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19830025732517242, "step": 13160 }, { "epoch": 0.26324, "grad_norm": 2.171875, "grad_norm_var": 0.012117258707682292, "learning_rate": 0.0001, "loss": 4.4843, "loss/crossentropy": 2.1951998472213745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916336566209793, "step": 13162 }, { "epoch": 0.26328, "grad_norm": 2.078125, "grad_norm_var": 0.011888631184895833, "learning_rate": 0.0001, "loss": 4.4944, "loss/crossentropy": 2.1612678170204163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21040990203619003, "step": 13164 }, { "epoch": 0.26332, "grad_norm": 2.078125, "grad_norm_var": 0.009415690104166667, "learning_rate": 0.0001, "loss": 4.2012, "loss/crossentropy": 1.9372112154960632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24868668615818024, "step": 13166 }, { "epoch": 0.26336, "grad_norm": 2.015625, "grad_norm_var": 0.009968058268229166, "learning_rate": 0.0001, "loss": 4.3605, "loss/crossentropy": 2.0220513939857483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22029200196266174, "step": 13168 }, { "epoch": 0.2634, "grad_norm": 1.921875, "grad_norm_var": 0.010355631510416666, "learning_rate": 0.0001, "loss": 3.7251, "loss/crossentropy": 1.6749334335327148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18502884358167648, "step": 13170 }, { "epoch": 0.26344, "grad_norm": 1.9375, "grad_norm_var": 0.010096995035807292, "learning_rate": 0.0001, "loss": 4.1394, "loss/crossentropy": 2.1463050842285156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21098337322473526, "step": 13172 }, { "epoch": 0.26348, "grad_norm": 2.125, "grad_norm_var": 0.009089914957682292, "learning_rate": 0.0001, "loss": 4.3261, "loss/crossentropy": 2.20473051071167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23006527870893478, "step": 13174 }, { "epoch": 0.26352, "grad_norm": 2.3125, "grad_norm_var": 0.012068684895833333, "learning_rate": 0.0001, "loss": 4.3289, "loss/crossentropy": 2.097456157207489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077179104089737, "step": 13176 }, { "epoch": 0.26356, "grad_norm": 2.015625, "grad_norm_var": 0.0124176025390625, "learning_rate": 0.0001, "loss": 4.402, "loss/crossentropy": 2.136400580406189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152952179312706, "step": 13178 }, { "epoch": 0.2636, "grad_norm": 1.921875, "grad_norm_var": 0.013451131184895833, "learning_rate": 0.0001, "loss": 4.23, "loss/crossentropy": 2.0791231393814087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19688992202281952, "step": 13180 }, { "epoch": 0.26364, "grad_norm": 2.03125, "grad_norm_var": 0.0126617431640625, "learning_rate": 0.0001, "loss": 4.1233, "loss/crossentropy": 2.135149598121643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22163429111242294, "step": 13182 }, { "epoch": 0.26368, "grad_norm": 2.171875, "grad_norm_var": 0.013936360677083334, "learning_rate": 0.0001, "loss": 4.4215, "loss/crossentropy": 2.3207671642303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24321655184030533, "step": 13184 }, { "epoch": 0.26372, "grad_norm": 2.03125, "grad_norm_var": 0.013069407145182291, "learning_rate": 0.0001, "loss": 4.0995, "loss/crossentropy": 2.0136974453926086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19274070858955383, "step": 13186 }, { "epoch": 0.26376, "grad_norm": 2.203125, "grad_norm_var": 0.013610585530598959, "learning_rate": 0.0001, "loss": 4.5094, "loss/crossentropy": 2.3542726039886475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24547156691551208, "step": 13188 }, { "epoch": 0.2638, "grad_norm": 1.8984375, "grad_norm_var": 0.016007486979166666, "learning_rate": 0.0001, "loss": 4.068, "loss/crossentropy": 1.905708134174347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18261709809303284, "step": 13190 }, { "epoch": 0.26384, "grad_norm": 1.921875, "grad_norm_var": 0.012147776285807292, "learning_rate": 0.0001, "loss": 4.1426, "loss/crossentropy": 2.137321710586548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052912563085556, "step": 13192 }, { "epoch": 0.26388, "grad_norm": 2.0625, "grad_norm_var": 0.010827382405598959, "learning_rate": 0.0001, "loss": 4.2252, "loss/crossentropy": 2.083053410053253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038491740822792, "step": 13194 }, { "epoch": 0.26392, "grad_norm": 2.0, "grad_norm_var": 0.011346181233723959, "learning_rate": 0.0001, "loss": 3.8349, "loss/crossentropy": 1.8006438612937927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910744607448578, "step": 13196 }, { "epoch": 0.26396, "grad_norm": 2.03125, "grad_norm_var": 0.012902577718098959, "learning_rate": 0.0001, "loss": 4.4104, "loss/crossentropy": 2.323628544807434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280128449201584, "step": 13198 }, { "epoch": 0.264, "grad_norm": 1.921875, "grad_norm_var": 0.010137685139973958, "learning_rate": 0.0001, "loss": 4.1231, "loss/crossentropy": 1.9763594269752502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19776062667369843, "step": 13200 }, { "epoch": 0.26404, "grad_norm": 2.109375, "grad_norm_var": 0.010601552327473958, "learning_rate": 0.0001, "loss": 4.0175, "loss/crossentropy": 1.8776894807815552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19732315093278885, "step": 13202 }, { "epoch": 0.26408, "grad_norm": 1.96875, "grad_norm_var": 0.007478841145833333, "learning_rate": 0.0001, "loss": 4.0713, "loss/crossentropy": 1.826314091682434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19153253734111786, "step": 13204 }, { "epoch": 0.26412, "grad_norm": 1.8828125, "grad_norm_var": 0.007673136393229167, "learning_rate": 0.0001, "loss": 4.1503, "loss/crossentropy": 2.435014486312866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22149913012981415, "step": 13206 }, { "epoch": 0.26416, "grad_norm": 2.4375, "grad_norm_var": 0.01866633097330729, "learning_rate": 0.0001, "loss": 4.0829, "loss/crossentropy": 2.055279493331909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21625792980194092, "step": 13208 }, { "epoch": 0.2642, "grad_norm": 1.984375, "grad_norm_var": 0.018641916910807292, "learning_rate": 0.0001, "loss": 4.216, "loss/crossentropy": 2.4019049406051636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22626115381717682, "step": 13210 }, { "epoch": 0.26424, "grad_norm": 1.9921875, "grad_norm_var": 0.017508697509765626, "learning_rate": 0.0001, "loss": 3.9822, "loss/crossentropy": 2.336918354034424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22773776203393936, "step": 13212 }, { "epoch": 0.26428, "grad_norm": 2.03125, "grad_norm_var": 0.01608454386393229, "learning_rate": 0.0001, "loss": 4.3929, "loss/crossentropy": 2.0398528575897217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20827088505029678, "step": 13214 }, { "epoch": 0.26432, "grad_norm": 2.03125, "grad_norm_var": 0.015419260660807291, "learning_rate": 0.0001, "loss": 4.026, "loss/crossentropy": 2.166857123374939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2136707603931427, "step": 13216 }, { "epoch": 0.26436, "grad_norm": 1.9921875, "grad_norm_var": 0.015095011393229166, "learning_rate": 0.0001, "loss": 4.2282, "loss/crossentropy": 1.8454258441925049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20678912848234177, "step": 13218 }, { "epoch": 0.2644, "grad_norm": 2.140625, "grad_norm_var": 0.014642079671223959, "learning_rate": 0.0001, "loss": 4.2966, "loss/crossentropy": 2.249597668647766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22785719484090805, "step": 13220 }, { "epoch": 0.26444, "grad_norm": 1.90625, "grad_norm_var": 0.014134724934895834, "learning_rate": 0.0001, "loss": 3.875, "loss/crossentropy": 1.7729946374893188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2014644593000412, "step": 13222 }, { "epoch": 0.26448, "grad_norm": 1.9296875, "grad_norm_var": 0.004976145426432292, "learning_rate": 0.0001, "loss": 3.9144, "loss/crossentropy": 1.7968116998672485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889382004737854, "step": 13224 }, { "epoch": 0.26452, "grad_norm": 2.015625, "grad_norm_var": 0.006510162353515625, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 2.0765512585639954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20551903545856476, "step": 13226 }, { "epoch": 0.26456, "grad_norm": 2.203125, "grad_norm_var": 0.007933553059895833, "learning_rate": 0.0001, "loss": 4.2603, "loss/crossentropy": 2.2007554173469543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2250915989279747, "step": 13228 }, { "epoch": 0.2646, "grad_norm": 1.9765625, "grad_norm_var": 0.008184560139973958, "learning_rate": 0.0001, "loss": 4.2014, "loss/crossentropy": 2.319425046443939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22913866490125656, "step": 13230 }, { "epoch": 0.26464, "grad_norm": 1.96875, "grad_norm_var": 0.008365631103515625, "learning_rate": 0.0001, "loss": 4.1395, "loss/crossentropy": 1.9845139980316162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20769847929477692, "step": 13232 }, { "epoch": 0.26468, "grad_norm": 1.96875, "grad_norm_var": 0.008674112955729167, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 1.94157075881958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2314443141222, "step": 13234 }, { "epoch": 0.26472, "grad_norm": 2.0, "grad_norm_var": 0.007470703125, "learning_rate": 0.0001, "loss": 4.3428, "loss/crossentropy": 2.0699294209480286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22432812303304672, "step": 13236 }, { "epoch": 0.26476, "grad_norm": 1.9453125, "grad_norm_var": 0.006841786702473958, "learning_rate": 0.0001, "loss": 3.983, "loss/crossentropy": 1.7656533122062683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20837824791669846, "step": 13238 }, { "epoch": 0.2648, "grad_norm": 1.9921875, "grad_norm_var": 0.0060618082682291664, "learning_rate": 0.0001, "loss": 4.1799, "loss/crossentropy": 1.9473227858543396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20151428878307343, "step": 13240 }, { "epoch": 0.26484, "grad_norm": 1.921875, "grad_norm_var": 0.005387369791666667, "learning_rate": 0.0001, "loss": 4.284, "loss/crossentropy": 2.060440719127655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2041771411895752, "step": 13242 }, { "epoch": 0.26488, "grad_norm": 1.8984375, "grad_norm_var": 0.003922526041666667, "learning_rate": 0.0001, "loss": 3.7503, "loss/crossentropy": 2.0440456867218018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2041776031255722, "step": 13244 }, { "epoch": 0.26492, "grad_norm": 2.109375, "grad_norm_var": 0.004622395833333333, "learning_rate": 0.0001, "loss": 4.1921, "loss/crossentropy": 1.9192551374435425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18886462599039078, "step": 13246 }, { "epoch": 0.26496, "grad_norm": 1.9296875, "grad_norm_var": 0.0072100321451822914, "learning_rate": 0.0001, "loss": 4.1673, "loss/crossentropy": 1.7112281918525696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18532422930002213, "step": 13248 }, { "epoch": 0.265, "grad_norm": 2.0, "grad_norm_var": 0.007637532552083334, "learning_rate": 0.0001, "loss": 4.0561, "loss/crossentropy": 1.9225355386734009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19994951784610748, "step": 13250 }, { "epoch": 0.26504, "grad_norm": 2.0, "grad_norm_var": 0.007950846354166667, "learning_rate": 0.0001, "loss": 3.6825, "loss/crossentropy": 2.0243517756462097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19785288721323013, "step": 13252 }, { "epoch": 0.26508, "grad_norm": 1.984375, "grad_norm_var": 0.007413482666015625, "learning_rate": 0.0001, "loss": 4.2929, "loss/crossentropy": 2.255920171737671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21220286190509796, "step": 13254 }, { "epoch": 0.26512, "grad_norm": 2.0625, "grad_norm_var": 0.008475748697916667, "learning_rate": 0.0001, "loss": 4.4145, "loss/crossentropy": 2.3024203777313232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22255335003137589, "step": 13256 }, { "epoch": 0.26516, "grad_norm": 2.03125, "grad_norm_var": 0.00733642578125, "learning_rate": 0.0001, "loss": 4.1386, "loss/crossentropy": 2.0467708110809326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20193494111299515, "step": 13258 }, { "epoch": 0.2652, "grad_norm": 2.03125, "grad_norm_var": 0.006064605712890625, "learning_rate": 0.0001, "loss": 4.2188, "loss/crossentropy": 2.1543976068496704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2257647141814232, "step": 13260 }, { "epoch": 0.26524, "grad_norm": 1.9609375, "grad_norm_var": 0.006373850504557291, "learning_rate": 0.0001, "loss": 4.2683, "loss/crossentropy": 2.077743351459503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964918076992035, "step": 13262 }, { "epoch": 0.26528, "grad_norm": 1.9921875, "grad_norm_var": 0.0044921875, "learning_rate": 0.0001, "loss": 3.963, "loss/crossentropy": 1.6356236338615417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1847509667277336, "step": 13264 }, { "epoch": 0.26532, "grad_norm": 1.953125, "grad_norm_var": 0.006925201416015625, "learning_rate": 0.0001, "loss": 4.3559, "loss/crossentropy": 2.0876659750938416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22080402821302414, "step": 13266 }, { "epoch": 0.26536, "grad_norm": 2.109375, "grad_norm_var": 0.005747222900390625, "learning_rate": 0.0001, "loss": 4.0452, "loss/crossentropy": 1.7057855129241943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18772340565919876, "step": 13268 }, { "epoch": 0.2654, "grad_norm": 1.890625, "grad_norm_var": 0.0078277587890625, "learning_rate": 0.0001, "loss": 4.0032, "loss/crossentropy": 2.0550594329833984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18500665575265884, "step": 13270 }, { "epoch": 0.26544, "grad_norm": 2.0, "grad_norm_var": 0.011131795247395833, "learning_rate": 0.0001, "loss": 4.2837, "loss/crossentropy": 2.027509331703186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21377252787351608, "step": 13272 }, { "epoch": 0.26548, "grad_norm": 1.921875, "grad_norm_var": 0.012393951416015625, "learning_rate": 0.0001, "loss": 3.9739, "loss/crossentropy": 2.0517578125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21172185242176056, "step": 13274 }, { "epoch": 0.26552, "grad_norm": 2.09375, "grad_norm_var": 0.01395263671875, "learning_rate": 0.0001, "loss": 4.1628, "loss/crossentropy": 2.0550750494003296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22374523431062698, "step": 13276 }, { "epoch": 0.26556, "grad_norm": 2.09375, "grad_norm_var": 0.013399251302083333, "learning_rate": 0.0001, "loss": 4.1829, "loss/crossentropy": 1.8055049777030945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17029780894517899, "step": 13278 }, { "epoch": 0.2656, "grad_norm": 1.9765625, "grad_norm_var": 0.013114166259765626, "learning_rate": 0.0001, "loss": 4.3999, "loss/crossentropy": 1.9650321006774902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20327500253915787, "step": 13280 }, { "epoch": 0.26564, "grad_norm": 1.9921875, "grad_norm_var": 0.010209147135416667, "learning_rate": 0.0001, "loss": 4.1395, "loss/crossentropy": 2.164494276046753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21579623967409134, "step": 13282 }, { "epoch": 0.26568, "grad_norm": 1.9296875, "grad_norm_var": 0.009810129801432291, "learning_rate": 0.0001, "loss": 4.0676, "loss/crossentropy": 2.0488376021385193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20213299989700317, "step": 13284 }, { "epoch": 0.26572, "grad_norm": 2.0, "grad_norm_var": 0.010308583577473959, "learning_rate": 0.0001, "loss": 3.7442, "loss/crossentropy": 1.948447048664093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19875720143318176, "step": 13286 }, { "epoch": 0.26576, "grad_norm": 1.875, "grad_norm_var": 0.008487701416015625, "learning_rate": 0.0001, "loss": 4.2312, "loss/crossentropy": 2.115865170955658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19675103574991226, "step": 13288 }, { "epoch": 0.2658, "grad_norm": 2.140625, "grad_norm_var": 0.0105377197265625, "learning_rate": 0.0001, "loss": 4.2289, "loss/crossentropy": 1.9720736145973206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20975899696350098, "step": 13290 }, { "epoch": 0.26584, "grad_norm": 2.015625, "grad_norm_var": 0.009154256184895833, "learning_rate": 0.0001, "loss": 3.9258, "loss/crossentropy": 1.9105132222175598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20582708716392517, "step": 13292 }, { "epoch": 0.26588, "grad_norm": 2.40625, "grad_norm_var": 0.018369293212890624, "learning_rate": 0.0001, "loss": 4.2629, "loss/crossentropy": 2.089789867401123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21264012902975082, "step": 13294 }, { "epoch": 0.26592, "grad_norm": 2.03125, "grad_norm_var": 0.018358357747395835, "learning_rate": 0.0001, "loss": 4.41, "loss/crossentropy": 2.271156430244446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21146492660045624, "step": 13296 }, { "epoch": 0.26596, "grad_norm": 1.90625, "grad_norm_var": 0.0196929931640625, "learning_rate": 0.0001, "loss": 3.8554, "loss/crossentropy": 1.930766224861145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959189623594284, "step": 13298 }, { "epoch": 0.266, "grad_norm": 2.25, "grad_norm_var": 0.023374176025390624, "learning_rate": 0.0001, "loss": 4.3551, "loss/crossentropy": 1.9945995807647705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20959854125976562, "step": 13300 }, { "epoch": 0.26604, "grad_norm": 2.125, "grad_norm_var": 0.020637003580729167, "learning_rate": 0.0001, "loss": 4.5445, "loss/crossentropy": 2.2128443717956543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21266601234674454, "step": 13302 }, { "epoch": 0.26608, "grad_norm": 2.03125, "grad_norm_var": 0.018049112955729165, "learning_rate": 0.0001, "loss": 4.2578, "loss/crossentropy": 2.076206088066101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2037106677889824, "step": 13304 }, { "epoch": 0.26612, "grad_norm": 2.171875, "grad_norm_var": 0.018724568684895835, "learning_rate": 0.0001, "loss": 4.0512, "loss/crossentropy": 2.077200174331665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.228537917137146, "step": 13306 }, { "epoch": 0.26616, "grad_norm": 2.0, "grad_norm_var": 0.01874567667643229, "learning_rate": 0.0001, "loss": 3.9641, "loss/crossentropy": 2.0368794202804565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150697112083435, "step": 13308 }, { "epoch": 0.2662, "grad_norm": 1.9375, "grad_norm_var": 0.016556803385416666, "learning_rate": 0.0001, "loss": 3.8516, "loss/crossentropy": 1.9108307361602783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903306469321251, "step": 13310 }, { "epoch": 0.26624, "grad_norm": 2.046875, "grad_norm_var": 0.016218058268229165, "learning_rate": 0.0001, "loss": 3.9928, "loss/crossentropy": 1.8118465542793274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18557358533143997, "step": 13312 }, { "epoch": 0.26628, "grad_norm": 1.8671875, "grad_norm_var": 0.01736424763997396, "learning_rate": 0.0001, "loss": 3.897, "loss/crossentropy": 1.7392275929450989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19214990735054016, "step": 13314 }, { "epoch": 0.26632, "grad_norm": 3.359375, "grad_norm_var": 0.12773412068684895, "learning_rate": 0.0001, "loss": 3.9728, "loss/crossentropy": 2.1361005306243896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22042304277420044, "step": 13316 }, { "epoch": 0.26636, "grad_norm": 1.9375, "grad_norm_var": 0.12786026000976564, "learning_rate": 0.0001, "loss": 4.0055, "loss/crossentropy": 2.0087279677391052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20278790593147278, "step": 13318 }, { "epoch": 0.2664, "grad_norm": 2.09375, "grad_norm_var": 0.12776692708333334, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 2.307933807373047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24929606914520264, "step": 13320 }, { "epoch": 0.26644, "grad_norm": 2.078125, "grad_norm_var": 0.12690022786458333, "learning_rate": 0.0001, "loss": 4.3451, "loss/crossentropy": 2.5539438724517822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2219071164727211, "step": 13322 }, { "epoch": 0.26648, "grad_norm": 1.953125, "grad_norm_var": 0.12910334269205728, "learning_rate": 0.0001, "loss": 3.9984, "loss/crossentropy": 1.9069242477416992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19276316463947296, "step": 13324 }, { "epoch": 0.26652, "grad_norm": 2.140625, "grad_norm_var": 0.12194010416666666, "learning_rate": 0.0001, "loss": 4.535, "loss/crossentropy": 2.0715484619140625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20079267024993896, "step": 13326 }, { "epoch": 0.26656, "grad_norm": 2.15625, "grad_norm_var": 0.12164713541666666, "learning_rate": 0.0001, "loss": 4.46, "loss/crossentropy": 2.1134172677993774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20564979314804077, "step": 13328 }, { "epoch": 0.2666, "grad_norm": 2.046875, "grad_norm_var": 0.11581929524739583, "learning_rate": 0.0001, "loss": 4.3992, "loss/crossentropy": 2.103445053100586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19361437857151031, "step": 13330 }, { "epoch": 0.26664, "grad_norm": 1.8203125, "grad_norm_var": 0.014499664306640625, "learning_rate": 0.0001, "loss": 4.0169, "loss/crossentropy": 2.1882529258728027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21402588486671448, "step": 13332 }, { "epoch": 0.26668, "grad_norm": 2.046875, "grad_norm_var": 0.015498606363932292, "learning_rate": 0.0001, "loss": 4.3247, "loss/crossentropy": 2.2531981468200684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019234597682953, "step": 13334 }, { "epoch": 0.26672, "grad_norm": 1.9921875, "grad_norm_var": 0.021954091389973958, "learning_rate": 0.0001, "loss": 3.8596, "loss/crossentropy": 1.8077877759933472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19121356308460236, "step": 13336 }, { "epoch": 0.26676, "grad_norm": 2.046875, "grad_norm_var": 0.02600072224934896, "learning_rate": 0.0001, "loss": 4.3195, "loss/crossentropy": 1.7387139797210693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20326050370931625, "step": 13338 }, { "epoch": 0.2668, "grad_norm": 1.984375, "grad_norm_var": 0.022712198893229167, "learning_rate": 0.0001, "loss": 4.065, "loss/crossentropy": 2.2061930894851685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22271326184272766, "step": 13340 }, { "epoch": 0.26684, "grad_norm": 2.109375, "grad_norm_var": 0.02191162109375, "learning_rate": 0.0001, "loss": 4.4018, "loss/crossentropy": 2.096329092979431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2201632410287857, "step": 13342 }, { "epoch": 0.26688, "grad_norm": 1.9375, "grad_norm_var": 0.0232574462890625, "learning_rate": 0.0001, "loss": 4.0783, "loss/crossentropy": 2.2015284299850464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22004209458827972, "step": 13344 }, { "epoch": 0.26692, "grad_norm": 2.125, "grad_norm_var": 0.023957316080729166, "learning_rate": 0.0001, "loss": 4.5227, "loss/crossentropy": 2.256517231464386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21762817353010178, "step": 13346 }, { "epoch": 0.26696, "grad_norm": 1.828125, "grad_norm_var": 0.022874959309895835, "learning_rate": 0.0001, "loss": 3.8578, "loss/crossentropy": 1.8551252484321594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18950944393873215, "step": 13348 }, { "epoch": 0.267, "grad_norm": 2.0, "grad_norm_var": 0.020967610677083335, "learning_rate": 0.0001, "loss": 4.3051, "loss/crossentropy": 1.8834841847419739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17950639128684998, "step": 13350 }, { "epoch": 0.26704, "grad_norm": 2.109375, "grad_norm_var": 0.012872060139973959, "learning_rate": 0.0001, "loss": 4.0234, "loss/crossentropy": 2.0059815645217896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2279331535100937, "step": 13352 }, { "epoch": 0.26708, "grad_norm": 2.015625, "grad_norm_var": 0.00594482421875, "learning_rate": 0.0001, "loss": 4.0102, "loss/crossentropy": 2.2327537536621094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22790935635566711, "step": 13354 }, { "epoch": 0.26712, "grad_norm": 2.109375, "grad_norm_var": 0.0066650390625, "learning_rate": 0.0001, "loss": 4.0535, "loss/crossentropy": 1.7623894214630127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17889687418937683, "step": 13356 }, { "epoch": 0.26716, "grad_norm": 1.984375, "grad_norm_var": 0.006205240885416667, "learning_rate": 0.0001, "loss": 4.1595, "loss/crossentropy": 1.9240076541900635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18552126735448837, "step": 13358 }, { "epoch": 0.2672, "grad_norm": 1.8515625, "grad_norm_var": 0.007387034098307292, "learning_rate": 0.0001, "loss": 3.9162, "loss/crossentropy": 2.301589012145996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20251981914043427, "step": 13360 }, { "epoch": 0.26724, "grad_norm": 2.078125, "grad_norm_var": 0.007671864827473959, "learning_rate": 0.0001, "loss": 4.1575, "loss/crossentropy": 2.144057512283325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22049692273139954, "step": 13362 }, { "epoch": 0.26728, "grad_norm": 2.03125, "grad_norm_var": 0.006209309895833333, "learning_rate": 0.0001, "loss": 4.2381, "loss/crossentropy": 1.9329981207847595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015545517206192, "step": 13364 }, { "epoch": 0.26732, "grad_norm": 2.046875, "grad_norm_var": 0.0062896728515625, "learning_rate": 0.0001, "loss": 4.2544, "loss/crossentropy": 2.057462990283966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19826926290988922, "step": 13366 }, { "epoch": 0.26736, "grad_norm": 2.03125, "grad_norm_var": 0.006670888264973958, "learning_rate": 0.0001, "loss": 4.085, "loss/crossentropy": 2.097291588783264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18807460367679596, "step": 13368 }, { "epoch": 0.2674, "grad_norm": 2.015625, "grad_norm_var": 0.007281239827473958, "learning_rate": 0.0001, "loss": 4.1204, "loss/crossentropy": 2.154082179069519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22133710980415344, "step": 13370 }, { "epoch": 0.26744, "grad_norm": 1.9453125, "grad_norm_var": 0.017561848958333334, "learning_rate": 0.0001, "loss": 4.1412, "loss/crossentropy": 2.208779454231262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139016017317772, "step": 13372 }, { "epoch": 0.26748, "grad_norm": 2.0, "grad_norm_var": 0.018355305989583334, "learning_rate": 0.0001, "loss": 3.8929, "loss/crossentropy": 2.251901865005493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22264431416988373, "step": 13374 }, { "epoch": 0.26752, "grad_norm": 2.15625, "grad_norm_var": 0.016857655843098958, "learning_rate": 0.0001, "loss": 4.3243, "loss/crossentropy": 2.4881285429000854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.250653475522995, "step": 13376 }, { "epoch": 0.26756, "grad_norm": 2.140625, "grad_norm_var": 0.017427571614583335, "learning_rate": 0.0001, "loss": 4.2806, "loss/crossentropy": 2.104843556880951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19004888087511063, "step": 13378 }, { "epoch": 0.2676, "grad_norm": 2.359375, "grad_norm_var": 0.022899373372395834, "learning_rate": 0.0001, "loss": 4.0708, "loss/crossentropy": 1.913047194480896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20572267472743988, "step": 13380 }, { "epoch": 0.26764, "grad_norm": 1.9765625, "grad_norm_var": 0.02463353474934896, "learning_rate": 0.0001, "loss": 4.3632, "loss/crossentropy": 2.1043838262557983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21029697358608246, "step": 13382 }, { "epoch": 0.26768, "grad_norm": 2.1875, "grad_norm_var": 0.02617162068684896, "learning_rate": 0.0001, "loss": 4.2429, "loss/crossentropy": 2.055271625518799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113131284713745, "step": 13384 }, { "epoch": 0.26772, "grad_norm": 2.71875, "grad_norm_var": 0.0488433837890625, "learning_rate": 0.0001, "loss": 4.4956, "loss/crossentropy": 1.8407886624336243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2729990780353546, "step": 13386 }, { "epoch": 0.26776, "grad_norm": 2.0625, "grad_norm_var": 0.04177017211914062, "learning_rate": 0.0001, "loss": 4.0097, "loss/crossentropy": 2.0368717908859253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21415862441062927, "step": 13388 }, { "epoch": 0.2678, "grad_norm": 1.953125, "grad_norm_var": 0.040897369384765625, "learning_rate": 0.0001, "loss": 4.1836, "loss/crossentropy": 1.8235292434692383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062123641371727, "step": 13390 }, { "epoch": 0.26784, "grad_norm": 1.9375, "grad_norm_var": 0.04222183227539063, "learning_rate": 0.0001, "loss": 3.8392, "loss/crossentropy": 1.733048439025879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20651907473802567, "step": 13392 }, { "epoch": 0.26788, "grad_norm": 1.9765625, "grad_norm_var": 0.043454742431640624, "learning_rate": 0.0001, "loss": 3.9004, "loss/crossentropy": 2.1339367628097534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21730080246925354, "step": 13394 }, { "epoch": 0.26792, "grad_norm": 1.890625, "grad_norm_var": 0.0452789306640625, "learning_rate": 0.0001, "loss": 3.8421, "loss/crossentropy": 1.885707974433899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20833835750818253, "step": 13396 }, { "epoch": 0.26796, "grad_norm": 1.890625, "grad_norm_var": 0.04429423014322917, "learning_rate": 0.0001, "loss": 3.9714, "loss/crossentropy": 1.9896257519721985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938292756676674, "step": 13398 }, { "epoch": 0.268, "grad_norm": 2.25, "grad_norm_var": 0.04457575480143229, "learning_rate": 0.0001, "loss": 4.2962, "loss/crossentropy": 2.0359573364257812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21572312712669373, "step": 13400 }, { "epoch": 0.26804, "grad_norm": 1.9296875, "grad_norm_var": 0.014095052083333334, "learning_rate": 0.0001, "loss": 4.3483, "loss/crossentropy": 2.241006851196289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2361859679222107, "step": 13402 }, { "epoch": 0.26808, "grad_norm": 1.96875, "grad_norm_var": 0.013852691650390625, "learning_rate": 0.0001, "loss": 4.0518, "loss/crossentropy": 2.0988917350769043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20220966637134552, "step": 13404 }, { "epoch": 0.26812, "grad_norm": 2.078125, "grad_norm_var": 0.014802805582682292, "learning_rate": 0.0001, "loss": 4.3664, "loss/crossentropy": 2.0803651213645935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22251859307289124, "step": 13406 }, { "epoch": 0.26816, "grad_norm": 2.046875, "grad_norm_var": 0.015827433268229166, "learning_rate": 0.0001, "loss": 4.0473, "loss/crossentropy": 1.9151242971420288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19792036712169647, "step": 13408 }, { "epoch": 0.2682, "grad_norm": 2.96875, "grad_norm_var": 0.07619196573893229, "learning_rate": 0.0001, "loss": 4.0321, "loss/crossentropy": 1.908549964427948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21568355709314346, "step": 13410 }, { "epoch": 0.26824, "grad_norm": 2.109375, "grad_norm_var": 0.06887919108072917, "learning_rate": 0.0001, "loss": 4.3844, "loss/crossentropy": 2.3226535320281982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2332221046090126, "step": 13412 }, { "epoch": 0.26828, "grad_norm": 1.9765625, "grad_norm_var": 0.06696548461914062, "learning_rate": 0.0001, "loss": 4.24, "loss/crossentropy": 1.9767839312553406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038547545671463, "step": 13414 }, { "epoch": 0.26832, "grad_norm": 1.859375, "grad_norm_var": 0.06800028483072916, "learning_rate": 0.0001, "loss": 3.9592, "loss/crossentropy": 2.2013003826141357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076154574751854, "step": 13416 }, { "epoch": 0.26836, "grad_norm": 1.9921875, "grad_norm_var": 0.06597671508789063, "learning_rate": 0.0001, "loss": 4.1128, "loss/crossentropy": 2.0941338539123535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20538556575775146, "step": 13418 }, { "epoch": 0.2684, "grad_norm": 2.125, "grad_norm_var": 0.06494954427083334, "learning_rate": 0.0001, "loss": 4.3656, "loss/crossentropy": 2.0206560492515564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20901528000831604, "step": 13420 }, { "epoch": 0.26844, "grad_norm": 2.171875, "grad_norm_var": 0.06690648396809896, "learning_rate": 0.0001, "loss": 4.2606, "loss/crossentropy": 2.1317169070243835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21660470962524414, "step": 13422 }, { "epoch": 0.26848, "grad_norm": 1.8203125, "grad_norm_var": 0.07109273274739583, "learning_rate": 0.0001, "loss": 3.6026, "loss/crossentropy": 1.631429135799408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1729787141084671, "step": 13424 }, { "epoch": 0.26852, "grad_norm": 1.8203125, "grad_norm_var": 0.012491607666015625, "learning_rate": 0.0001, "loss": 3.923, "loss/crossentropy": 1.8428707122802734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18231894075870514, "step": 13426 }, { "epoch": 0.26856, "grad_norm": 1.984375, "grad_norm_var": 0.011104075113932292, "learning_rate": 0.0001, "loss": 4.2351, "loss/crossentropy": 1.860277235507965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18769516795873642, "step": 13428 }, { "epoch": 0.2686, "grad_norm": 1.890625, "grad_norm_var": 0.012542470296223959, "learning_rate": 0.0001, "loss": 4.0199, "loss/crossentropy": 2.02046799659729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160683423280716, "step": 13430 }, { "epoch": 0.26864, "grad_norm": 2.359375, "grad_norm_var": 0.021491495768229167, "learning_rate": 0.0001, "loss": 4.5105, "loss/crossentropy": 2.094564437866211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22155898064374924, "step": 13432 }, { "epoch": 0.26868, "grad_norm": 2.109375, "grad_norm_var": 0.02215550740559896, "learning_rate": 0.0001, "loss": 4.0382, "loss/crossentropy": 2.013366401195526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126912623643875, "step": 13434 }, { "epoch": 0.26872, "grad_norm": 2.015625, "grad_norm_var": 0.02212702433268229, "learning_rate": 0.0001, "loss": 4.5826, "loss/crossentropy": 2.360998272895813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2355523630976677, "step": 13436 }, { "epoch": 0.26876, "grad_norm": 1.7890625, "grad_norm_var": 0.02289606730143229, "learning_rate": 0.0001, "loss": 3.7129, "loss/crossentropy": 1.754651427268982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18666986376047134, "step": 13438 }, { "epoch": 0.2688, "grad_norm": 2.03125, "grad_norm_var": 0.01812922159830729, "learning_rate": 0.0001, "loss": 4.1024, "loss/crossentropy": 1.9176424741744995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19241741299629211, "step": 13440 }, { "epoch": 0.26884, "grad_norm": 2.078125, "grad_norm_var": 0.0154449462890625, "learning_rate": 0.0001, "loss": 4.2548, "loss/crossentropy": 2.324304223060608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22302107512950897, "step": 13442 }, { "epoch": 0.26888, "grad_norm": 2.5, "grad_norm_var": 0.028758748372395834, "learning_rate": 0.0001, "loss": 4.0693, "loss/crossentropy": 1.8527624011039734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18644367158412933, "step": 13444 }, { "epoch": 0.26892, "grad_norm": 2.1875, "grad_norm_var": 0.027570597330729165, "learning_rate": 0.0001, "loss": 4.4437, "loss/crossentropy": 2.0999260544776917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21172572672367096, "step": 13446 }, { "epoch": 0.26896, "grad_norm": 1.90625, "grad_norm_var": 0.026627349853515624, "learning_rate": 0.0001, "loss": 3.8125, "loss/crossentropy": 1.8577081561088562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17302225530147552, "step": 13448 }, { "epoch": 0.269, "grad_norm": 1.921875, "grad_norm_var": 0.026956939697265626, "learning_rate": 0.0001, "loss": 4.2182, "loss/crossentropy": 2.11561119556427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20249811559915543, "step": 13450 }, { "epoch": 0.26904, "grad_norm": 1.921875, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.3036913871765137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20101947337388992, "step": 13452 }, { "epoch": 0.26908, "grad_norm": 2.078125, "grad_norm_var": 0.02341283162434896, "learning_rate": 0.0001, "loss": 3.9013, "loss/crossentropy": 2.0498175621032715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19935546070337296, "step": 13454 }, { "epoch": 0.26912, "grad_norm": 2.09375, "grad_norm_var": 0.08842137654622396, "learning_rate": 0.0001, "loss": 4.2601, "loss/crossentropy": 2.27053964138031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31231266260147095, "step": 13456 }, { "epoch": 0.26916, "grad_norm": 1.921875, "grad_norm_var": 0.0908953348795573, "learning_rate": 0.0001, "loss": 4.0035, "loss/crossentropy": 2.191486120223999, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058379054069519, "step": 13458 }, { "epoch": 0.2692, "grad_norm": 2.015625, "grad_norm_var": 0.07780939737955729, "learning_rate": 0.0001, "loss": 4.1193, "loss/crossentropy": 2.008154332637787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20862725377082825, "step": 13460 }, { "epoch": 0.26924, "grad_norm": 2.109375, "grad_norm_var": 0.07779947916666667, "learning_rate": 0.0001, "loss": 4.3191, "loss/crossentropy": 2.3374814987182617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24534277617931366, "step": 13462 }, { "epoch": 0.26928, "grad_norm": 2.0625, "grad_norm_var": 0.07355855305989584, "learning_rate": 0.0001, "loss": 4.0558, "loss/crossentropy": 2.2654502391815186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21406329423189163, "step": 13464 }, { "epoch": 0.26932, "grad_norm": 2.21875, "grad_norm_var": 0.07412821451822917, "learning_rate": 0.0001, "loss": 4.2078, "loss/crossentropy": 2.1021856665611267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19653601199388504, "step": 13466 }, { "epoch": 0.26936, "grad_norm": 2.140625, "grad_norm_var": 0.0727068583170573, "learning_rate": 0.0001, "loss": 4.4466, "loss/crossentropy": 1.9482329487800598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20822357386350632, "step": 13468 }, { "epoch": 0.2694, "grad_norm": 1.8125, "grad_norm_var": 0.07814915974934895, "learning_rate": 0.0001, "loss": 4.1036, "loss/crossentropy": 2.122725486755371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132444903254509, "step": 13470 }, { "epoch": 0.26944, "grad_norm": 2.0, "grad_norm_var": 0.015044911702473959, "learning_rate": 0.0001, "loss": 4.0337, "loss/crossentropy": 1.760430932044983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1835266426205635, "step": 13472 }, { "epoch": 0.26948, "grad_norm": 2.046875, "grad_norm_var": 0.014847564697265624, "learning_rate": 0.0001, "loss": 4.0338, "loss/crossentropy": 1.610491931438446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19731693714857101, "step": 13474 }, { "epoch": 0.26952, "grad_norm": 2.03125, "grad_norm_var": 0.015313466389973959, "learning_rate": 0.0001, "loss": 4.213, "loss/crossentropy": 2.1966941356658936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22074176371097565, "step": 13476 }, { "epoch": 0.26956, "grad_norm": 1.8515625, "grad_norm_var": 0.029904937744140624, "learning_rate": 0.0001, "loss": 4.1291, "loss/crossentropy": 2.045714259147644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193753719329834, "step": 13478 }, { "epoch": 0.2696, "grad_norm": 2.0, "grad_norm_var": 0.029886881510416668, "learning_rate": 0.0001, "loss": 4.0127, "loss/crossentropy": 1.860603928565979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900448128581047, "step": 13480 }, { "epoch": 0.26964, "grad_norm": 2.125, "grad_norm_var": 0.028449503580729167, "learning_rate": 0.0001, "loss": 4.0643, "loss/crossentropy": 2.171591639518738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21908972412347794, "step": 13482 }, { "epoch": 0.26968, "grad_norm": 2.296875, "grad_norm_var": 0.030265299479166667, "learning_rate": 0.0001, "loss": 4.1351, "loss/crossentropy": 2.271330237388611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21823390573263168, "step": 13484 }, { "epoch": 0.26972, "grad_norm": 1.953125, "grad_norm_var": 0.026875813802083332, "learning_rate": 0.0001, "loss": 4.192, "loss/crossentropy": 2.155557870864868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20512574911117554, "step": 13486 }, { "epoch": 0.26976, "grad_norm": 1.9609375, "grad_norm_var": 0.027164459228515625, "learning_rate": 0.0001, "loss": 4.2561, "loss/crossentropy": 2.3827039003372192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22485698014497757, "step": 13488 }, { "epoch": 0.2698, "grad_norm": 1.9375, "grad_norm_var": 0.028636678059895834, "learning_rate": 0.0001, "loss": 3.9937, "loss/crossentropy": 2.225351929664612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22235903143882751, "step": 13490 }, { "epoch": 0.26984, "grad_norm": 2.015625, "grad_norm_var": 0.027717081705729167, "learning_rate": 0.0001, "loss": 4.0144, "loss/crossentropy": 2.1125447750091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20206717401742935, "step": 13492 }, { "epoch": 0.26988, "grad_norm": 1.9296875, "grad_norm_var": 0.013639068603515625, "learning_rate": 0.0001, "loss": 4.0318, "loss/crossentropy": 2.30017626285553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22321031987667084, "step": 13494 }, { "epoch": 0.26992, "grad_norm": 1.9609375, "grad_norm_var": 0.0145904541015625, "learning_rate": 0.0001, "loss": 4.3979, "loss/crossentropy": 2.416603446006775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2413351908326149, "step": 13496 }, { "epoch": 0.26996, "grad_norm": 2.046875, "grad_norm_var": 0.013185373942057292, "learning_rate": 0.0001, "loss": 4.0436, "loss/crossentropy": 2.06734299659729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23090296238660812, "step": 13498 }, { "epoch": 0.27, "grad_norm": 1.96875, "grad_norm_var": 0.0050961812337239586, "learning_rate": 0.0001, "loss": 4.0, "loss/crossentropy": 1.8088473677635193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1988995596766472, "step": 13500 }, { "epoch": 0.27004, "grad_norm": 2.015625, "grad_norm_var": 0.005092112223307291, "learning_rate": 0.0001, "loss": 4.0205, "loss/crossentropy": 1.7075524926185608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18463176488876343, "step": 13502 }, { "epoch": 0.27008, "grad_norm": 2.21875, "grad_norm_var": 0.0083160400390625, "learning_rate": 0.0001, "loss": 4.2291, "loss/crossentropy": 2.1184898018836975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20941460877656937, "step": 13504 }, { "epoch": 0.27012, "grad_norm": 1.890625, "grad_norm_var": 0.007736968994140625, "learning_rate": 0.0001, "loss": 4.0139, "loss/crossentropy": 1.8351567387580872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18913155794143677, "step": 13506 }, { "epoch": 0.27016, "grad_norm": 1.8515625, "grad_norm_var": 0.00914306640625, "learning_rate": 0.0001, "loss": 3.935, "loss/crossentropy": 2.090391516685486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21020452678203583, "step": 13508 }, { "epoch": 0.2702, "grad_norm": 2.046875, "grad_norm_var": 0.0078521728515625, "learning_rate": 0.0001, "loss": 4.1805, "loss/crossentropy": 2.0680218935012817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20998934656381607, "step": 13510 }, { "epoch": 0.27024, "grad_norm": 1.90625, "grad_norm_var": 0.007236480712890625, "learning_rate": 0.0001, "loss": 4.0074, "loss/crossentropy": 2.258071780204773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22634688764810562, "step": 13512 }, { "epoch": 0.27028, "grad_norm": 2.15625, "grad_norm_var": 0.03350397745768229, "learning_rate": 0.0001, "loss": 4.775, "loss/crossentropy": 1.931319773197174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.29265259206295013, "step": 13514 }, { "epoch": 0.27032, "grad_norm": 1.9296875, "grad_norm_var": 0.03398844401041667, "learning_rate": 0.0001, "loss": 4.3408, "loss/crossentropy": 2.45454478263855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22465243190526962, "step": 13516 }, { "epoch": 0.27036, "grad_norm": 2.046875, "grad_norm_var": 0.034063466389973956, "learning_rate": 0.0001, "loss": 4.2279, "loss/crossentropy": 2.005887746810913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19818827509880066, "step": 13518 }, { "epoch": 0.2704, "grad_norm": 2.03125, "grad_norm_var": 0.031107330322265626, "learning_rate": 0.0001, "loss": 4.318, "loss/crossentropy": 2.2329577207565308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21015001833438873, "step": 13520 }, { "epoch": 0.27044, "grad_norm": 1.8359375, "grad_norm_var": 0.032373046875, "learning_rate": 0.0001, "loss": 4.1026, "loss/crossentropy": 2.1716688871383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20930524915456772, "step": 13522 }, { "epoch": 0.27048, "grad_norm": 1.9609375, "grad_norm_var": 0.030651601155598958, "learning_rate": 0.0001, "loss": 3.957, "loss/crossentropy": 1.659675419330597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963459849357605, "step": 13524 }, { "epoch": 0.27052, "grad_norm": 1.9140625, "grad_norm_var": 0.035359700520833336, "learning_rate": 0.0001, "loss": 4.2214, "loss/crossentropy": 1.9285706877708435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1708521470427513, "step": 13526 }, { "epoch": 0.27056, "grad_norm": 2.046875, "grad_norm_var": 0.03286031087239583, "learning_rate": 0.0001, "loss": 4.1771, "loss/crossentropy": 2.0428953170776367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20893662422895432, "step": 13528 }, { "epoch": 0.2706, "grad_norm": 2.078125, "grad_norm_var": 0.011655426025390625, "learning_rate": 0.0001, "loss": 3.9885, "loss/crossentropy": 1.979922592639923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19091727584600449, "step": 13530 }, { "epoch": 0.27064, "grad_norm": 1.984375, "grad_norm_var": 0.015922037760416667, "learning_rate": 0.0001, "loss": 4.3176, "loss/crossentropy": 2.1879321336746216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24813223630189896, "step": 13532 }, { "epoch": 0.27068, "grad_norm": 1.9921875, "grad_norm_var": 0.01631647745768229, "learning_rate": 0.0001, "loss": 4.1852, "loss/crossentropy": 2.0450875759124756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21059003472328186, "step": 13534 }, { "epoch": 0.27072, "grad_norm": 1.984375, "grad_norm_var": 0.01673151652018229, "learning_rate": 0.0001, "loss": 4.3549, "loss/crossentropy": 2.050394892692566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21235206723213196, "step": 13536 }, { "epoch": 0.27076, "grad_norm": 2.0, "grad_norm_var": 0.014793904622395833, "learning_rate": 0.0001, "loss": 4.0128, "loss/crossentropy": 2.0687233805656433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22125782817602158, "step": 13538 }, { "epoch": 0.2708, "grad_norm": 2.046875, "grad_norm_var": 0.014647420247395833, "learning_rate": 0.0001, "loss": 4.1247, "loss/crossentropy": 1.8587198853492737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19718395173549652, "step": 13540 }, { "epoch": 0.27084, "grad_norm": 2.125, "grad_norm_var": 0.010223134358723959, "learning_rate": 0.0001, "loss": 4.2992, "loss/crossentropy": 2.2416387796401978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23797060549259186, "step": 13542 }, { "epoch": 0.27088, "grad_norm": 4.8125, "grad_norm_var": 0.495751953125, "learning_rate": 0.0001, "loss": 4.1875, "loss/crossentropy": 2.251328468322754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23915010690689087, "step": 13544 }, { "epoch": 0.27092, "grad_norm": 2.125, "grad_norm_var": 0.4896074930826823, "learning_rate": 0.0001, "loss": 4.0601, "loss/crossentropy": 1.990403652191162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21710015833377838, "step": 13546 }, { "epoch": 0.27096, "grad_norm": 2.09375, "grad_norm_var": 0.4871070861816406, "learning_rate": 0.0001, "loss": 4.38, "loss/crossentropy": 2.176861047744751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2153458073735237, "step": 13548 }, { "epoch": 0.271, "grad_norm": 1.953125, "grad_norm_var": 0.4867286682128906, "learning_rate": 0.0001, "loss": 4.0311, "loss/crossentropy": 1.581967830657959, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787950098514557, "step": 13550 }, { "epoch": 0.27104, "grad_norm": 1.90625, "grad_norm_var": 0.4920183817545573, "learning_rate": 0.0001, "loss": 3.9803, "loss/crossentropy": 1.827072560787201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18847094476222992, "step": 13552 }, { "epoch": 0.27108, "grad_norm": 2.03125, "grad_norm_var": 0.49279683430989585, "learning_rate": 0.0001, "loss": 3.9381, "loss/crossentropy": 1.5748514533042908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18339257687330246, "step": 13554 }, { "epoch": 0.27112, "grad_norm": 2.125, "grad_norm_var": 0.4897989908854167, "learning_rate": 0.0001, "loss": 4.2545, "loss/crossentropy": 2.3187586069107056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2535529136657715, "step": 13556 }, { "epoch": 0.27116, "grad_norm": 2.15625, "grad_norm_var": 0.4885660807291667, "learning_rate": 0.0001, "loss": 4.4848, "loss/crossentropy": 2.3675668239593506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23080945760011673, "step": 13558 }, { "epoch": 0.2712, "grad_norm": 2.0625, "grad_norm_var": 0.0066912333170572914, "learning_rate": 0.0001, "loss": 4.2032, "loss/crossentropy": 2.3976542949676514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2418106645345688, "step": 13560 }, { "epoch": 0.27124, "grad_norm": 1.984375, "grad_norm_var": 0.006221262613932291, "learning_rate": 0.0001, "loss": 4.1877, "loss/crossentropy": 2.1341328024864197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21845046430826187, "step": 13562 }, { "epoch": 0.27128, "grad_norm": 2.046875, "grad_norm_var": 0.006154123942057292, "learning_rate": 0.0001, "loss": 4.3067, "loss/crossentropy": 2.0519689321517944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21991144120693207, "step": 13564 }, { "epoch": 0.27132, "grad_norm": 2.0, "grad_norm_var": 0.005812327067057292, "learning_rate": 0.0001, "loss": 4.106, "loss/crossentropy": 1.9690070748329163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20945511013269424, "step": 13566 }, { "epoch": 0.27136, "grad_norm": 2.09375, "grad_norm_var": 0.004609934488932292, "learning_rate": 0.0001, "loss": 4.0929, "loss/crossentropy": 1.9553492069244385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2299523577094078, "step": 13568 }, { "epoch": 0.2714, "grad_norm": 2.25, "grad_norm_var": 0.0055501302083333336, "learning_rate": 0.0001, "loss": 4.158, "loss/crossentropy": 1.8214278817176819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1801338642835617, "step": 13570 }, { "epoch": 0.27144, "grad_norm": 1.7578125, "grad_norm_var": 0.011822255452473958, "learning_rate": 0.0001, "loss": 3.8658, "loss/crossentropy": 1.7834638953208923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18421506136655807, "step": 13572 }, { "epoch": 0.27148, "grad_norm": 1.984375, "grad_norm_var": 0.013097890218098958, "learning_rate": 0.0001, "loss": 3.8404, "loss/crossentropy": 1.7703429460525513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18263264745473862, "step": 13574 }, { "epoch": 0.27152, "grad_norm": 2.21875, "grad_norm_var": 0.0155181884765625, "learning_rate": 0.0001, "loss": 4.1489, "loss/crossentropy": 2.228934168815613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21034922450780869, "step": 13576 }, { "epoch": 0.27156, "grad_norm": 2.046875, "grad_norm_var": 0.015379842122395833, "learning_rate": 0.0001, "loss": 4.4665, "loss/crossentropy": 2.160655975341797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22300932556390762, "step": 13578 }, { "epoch": 0.2716, "grad_norm": 2.078125, "grad_norm_var": 0.0150634765625, "learning_rate": 0.0001, "loss": 4.2042, "loss/crossentropy": 2.1582624912261963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21843481063842773, "step": 13580 }, { "epoch": 0.27164, "grad_norm": 2.03125, "grad_norm_var": 0.0150146484375, "learning_rate": 0.0001, "loss": 4.2513, "loss/crossentropy": 2.0660162568092346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1985679790377617, "step": 13582 }, { "epoch": 0.27168, "grad_norm": 2.0625, "grad_norm_var": 0.015433502197265626, "learning_rate": 0.0001, "loss": 3.873, "loss/crossentropy": 1.839695692062378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772506907582283, "step": 13584 }, { "epoch": 0.27172, "grad_norm": 2.015625, "grad_norm_var": 0.012345123291015624, "learning_rate": 0.0001, "loss": 3.9649, "loss/crossentropy": 2.0837132930755615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19461007416248322, "step": 13586 }, { "epoch": 0.27176, "grad_norm": 2.09375, "grad_norm_var": 0.00750732421875, "learning_rate": 0.0001, "loss": 4.1986, "loss/crossentropy": 1.910473346710205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20272061973810196, "step": 13588 }, { "epoch": 0.2718, "grad_norm": 2.921875, "grad_norm_var": 0.054402669270833336, "learning_rate": 0.0001, "loss": 4.2259, "loss/crossentropy": 2.2427467107772827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21372541785240173, "step": 13590 }, { "epoch": 0.27184, "grad_norm": 2.0625, "grad_norm_var": 0.055272420247395836, "learning_rate": 0.0001, "loss": 4.1123, "loss/crossentropy": 2.165425181388855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2552504763007164, "step": 13592 }, { "epoch": 0.27188, "grad_norm": 1.96875, "grad_norm_var": 0.05620930989583333, "learning_rate": 0.0001, "loss": 4.0993, "loss/crossentropy": 1.9684009552001953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19894393533468246, "step": 13594 }, { "epoch": 0.27192, "grad_norm": 2.046875, "grad_norm_var": 0.055863444010416666, "learning_rate": 0.0001, "loss": 4.2421, "loss/crossentropy": 2.0572606325149536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20954644680023193, "step": 13596 }, { "epoch": 0.27196, "grad_norm": 1.78125, "grad_norm_var": 0.0611724853515625, "learning_rate": 0.0001, "loss": 3.5511, "loss/crossentropy": 1.8215171694755554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905388981103897, "step": 13598 }, { "epoch": 0.272, "grad_norm": 2.28125, "grad_norm_var": 0.06299209594726562, "learning_rate": 0.0001, "loss": 4.2407, "loss/crossentropy": 2.114215850830078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21045731008052826, "step": 13600 }, { "epoch": 0.27204, "grad_norm": 1.9921875, "grad_norm_var": 0.06243057250976562, "learning_rate": 0.0001, "loss": 4.1409, "loss/crossentropy": 2.301407814025879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148626372218132, "step": 13602 }, { "epoch": 0.27208, "grad_norm": 1.9453125, "grad_norm_var": 0.06350809733072917, "learning_rate": 0.0001, "loss": 4.4367, "loss/crossentropy": 2.0508424639701843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21292225271463394, "step": 13604 }, { "epoch": 0.27212, "grad_norm": 2.1875, "grad_norm_var": 0.014826456705729166, "learning_rate": 0.0001, "loss": 4.0024, "loss/crossentropy": 2.095071792602539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23258864879608154, "step": 13606 }, { "epoch": 0.27216, "grad_norm": 1.9765625, "grad_norm_var": 0.013802083333333333, "learning_rate": 0.0001, "loss": 4.1722, "loss/crossentropy": 2.3613405227661133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20384693890810013, "step": 13608 }, { "epoch": 0.2722, "grad_norm": 2.0625, "grad_norm_var": 0.0142578125, "learning_rate": 0.0001, "loss": 4.1397, "loss/crossentropy": 2.1618664264678955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21691302955150604, "step": 13610 }, { "epoch": 0.27224, "grad_norm": 2.03125, "grad_norm_var": 0.015620676676432292, "learning_rate": 0.0001, "loss": 3.8335, "loss/crossentropy": 1.6352717280387878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18613358587026596, "step": 13612 }, { "epoch": 0.27228, "grad_norm": 2.15625, "grad_norm_var": 0.012851715087890625, "learning_rate": 0.0001, "loss": 4.2405, "loss/crossentropy": 2.304627537727356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21715758740901947, "step": 13614 }, { "epoch": 0.27232, "grad_norm": 2.078125, "grad_norm_var": 0.009357706705729166, "learning_rate": 0.0001, "loss": 4.1845, "loss/crossentropy": 2.232232451438904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20997462421655655, "step": 13616 }, { "epoch": 0.27236, "grad_norm": 2.109375, "grad_norm_var": 0.011503092447916667, "learning_rate": 0.0001, "loss": 4.1273, "loss/crossentropy": 2.0350120663642883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20753738284111023, "step": 13618 }, { "epoch": 0.2724, "grad_norm": 1.8828125, "grad_norm_var": 0.015746053059895834, "learning_rate": 0.0001, "loss": 4.1955, "loss/crossentropy": 1.892416536808014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1920388638973236, "step": 13620 }, { "epoch": 0.27244, "grad_norm": 2.015625, "grad_norm_var": 0.0128814697265625, "learning_rate": 0.0001, "loss": 4.3283, "loss/crossentropy": 2.1694198846817017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217724971473217, "step": 13622 }, { "epoch": 0.27248, "grad_norm": 1.96875, "grad_norm_var": 0.013444010416666667, "learning_rate": 0.0001, "loss": 3.9786, "loss/crossentropy": 2.1418001651763916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016705498099327, "step": 13624 }, { "epoch": 0.27252, "grad_norm": 2.171875, "grad_norm_var": 0.014240519205729166, "learning_rate": 0.0001, "loss": 4.3992, "loss/crossentropy": 2.32223117351532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21616849303245544, "step": 13626 }, { "epoch": 0.27256, "grad_norm": 2.03125, "grad_norm_var": 0.013044230143229167, "learning_rate": 0.0001, "loss": 4.1847, "loss/crossentropy": 1.9991823434829712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077011615037918, "step": 13628 }, { "epoch": 0.2726, "grad_norm": 1.8984375, "grad_norm_var": 0.012434641520182291, "learning_rate": 0.0001, "loss": 4.1331, "loss/crossentropy": 2.1381043195724487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20748750865459442, "step": 13630 }, { "epoch": 0.27264, "grad_norm": 2.015625, "grad_norm_var": 0.012303670247395834, "learning_rate": 0.0001, "loss": 4.2215, "loss/crossentropy": 2.018588602542877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20897169411182404, "step": 13632 }, { "epoch": 0.27268, "grad_norm": 2.15625, "grad_norm_var": 0.01114501953125, "learning_rate": 0.0001, "loss": 4.3528, "loss/crossentropy": 2.1859233379364014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21695572137832642, "step": 13634 }, { "epoch": 0.27272, "grad_norm": 2.015625, "grad_norm_var": 0.0060198465983072914, "learning_rate": 0.0001, "loss": 4.2106, "loss/crossentropy": 2.17002010345459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21605493128299713, "step": 13636 }, { "epoch": 0.27276, "grad_norm": 2.046875, "grad_norm_var": 0.006058502197265625, "learning_rate": 0.0001, "loss": 4.261, "loss/crossentropy": 2.186649441719055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172158733010292, "step": 13638 }, { "epoch": 0.2728, "grad_norm": 1.9296875, "grad_norm_var": 0.007306925455729167, "learning_rate": 0.0001, "loss": 3.8394, "loss/crossentropy": 1.7324808835983276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17334696650505066, "step": 13640 }, { "epoch": 0.27284, "grad_norm": 2.03125, "grad_norm_var": 0.0049550374348958336, "learning_rate": 0.0001, "loss": 3.9805, "loss/crossentropy": 1.7768054008483887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1750154346227646, "step": 13642 }, { "epoch": 0.27288, "grad_norm": 1.9765625, "grad_norm_var": 0.0047515869140625, "learning_rate": 0.0001, "loss": 4.2292, "loss/crossentropy": 1.9629738330841064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19489944726228714, "step": 13644 }, { "epoch": 0.27292, "grad_norm": 2.109375, "grad_norm_var": 0.006882476806640625, "learning_rate": 0.0001, "loss": 4.4096, "loss/crossentropy": 2.144750416278839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19367601722478867, "step": 13646 }, { "epoch": 0.27296, "grad_norm": 2.03125, "grad_norm_var": 0.0067291259765625, "learning_rate": 0.0001, "loss": 4.1301, "loss/crossentropy": 1.9281827211380005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19894402474164963, "step": 13648 }, { "epoch": 0.273, "grad_norm": 1.8984375, "grad_norm_var": 0.007132720947265625, "learning_rate": 0.0001, "loss": 4.1998, "loss/crossentropy": 1.8092535138130188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18536384403705597, "step": 13650 }, { "epoch": 0.27304, "grad_norm": 2.046875, "grad_norm_var": 0.007726796468098958, "learning_rate": 0.0001, "loss": 4.1697, "loss/crossentropy": 2.5707184076309204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23067744076251984, "step": 13652 }, { "epoch": 0.27308, "grad_norm": 2.03125, "grad_norm_var": 0.008434804280598958, "learning_rate": 0.0001, "loss": 4.0189, "loss/crossentropy": 2.108555316925049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052021473646164, "step": 13654 }, { "epoch": 0.27312, "grad_norm": 1.984375, "grad_norm_var": 0.007228342692057291, "learning_rate": 0.0001, "loss": 4.3412, "loss/crossentropy": 1.836738109588623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20734501630067825, "step": 13656 }, { "epoch": 0.27316, "grad_norm": 1.9609375, "grad_norm_var": 0.007990519205729166, "learning_rate": 0.0001, "loss": 4.0783, "loss/crossentropy": 2.223781406879425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20618071407079697, "step": 13658 }, { "epoch": 0.2732, "grad_norm": 2.0, "grad_norm_var": 0.007207997639973958, "learning_rate": 0.0001, "loss": 4.1722, "loss/crossentropy": 2.028991222381592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20579702407121658, "step": 13660 }, { "epoch": 0.27324, "grad_norm": 2.015625, "grad_norm_var": 0.006915028889973958, "learning_rate": 0.0001, "loss": 4.0294, "loss/crossentropy": 2.1003119349479675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21900994330644608, "step": 13662 }, { "epoch": 0.27328, "grad_norm": 1.9453125, "grad_norm_var": 0.007289377848307291, "learning_rate": 0.0001, "loss": 4.1695, "loss/crossentropy": 2.1813069581985474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117675319314003, "step": 13664 }, { "epoch": 0.27332, "grad_norm": 2.140625, "grad_norm_var": 0.0105621337890625, "learning_rate": 0.0001, "loss": 3.8666, "loss/crossentropy": 2.2203832864761353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20088640600442886, "step": 13666 }, { "epoch": 0.27336, "grad_norm": 1.90625, "grad_norm_var": 0.0103759765625, "learning_rate": 0.0001, "loss": 4.1084, "loss/crossentropy": 1.8810867071151733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20646882802248, "step": 13668 }, { "epoch": 0.2734, "grad_norm": 1.9609375, "grad_norm_var": 0.010060373942057292, "learning_rate": 0.0001, "loss": 4.0746, "loss/crossentropy": 2.2147200107574463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038613259792328, "step": 13670 }, { "epoch": 0.27344, "grad_norm": 2.078125, "grad_norm_var": 0.008408355712890624, "learning_rate": 0.0001, "loss": 4.2151, "loss/crossentropy": 2.3654199838638306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22163932025432587, "step": 13672 }, { "epoch": 0.27348, "grad_norm": 1.9375, "grad_norm_var": 0.0086669921875, "learning_rate": 0.0001, "loss": 4.1584, "loss/crossentropy": 2.422420859336853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23172692954540253, "step": 13674 }, { "epoch": 0.27352, "grad_norm": 2.015625, "grad_norm_var": 0.008634440104166667, "learning_rate": 0.0001, "loss": 4.391, "loss/crossentropy": 2.0776702165603638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2030162662267685, "step": 13676 }, { "epoch": 0.27356, "grad_norm": 2.125, "grad_norm_var": 0.009611002604166667, "learning_rate": 0.0001, "loss": 3.9108, "loss/crossentropy": 1.7956212162971497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932467818260193, "step": 13678 }, { "epoch": 0.2736, "grad_norm": 2.046875, "grad_norm_var": 0.010573069254557291, "learning_rate": 0.0001, "loss": 4.1768, "loss/crossentropy": 2.176727533340454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20450271666049957, "step": 13680 }, { "epoch": 0.27364, "grad_norm": 1.90625, "grad_norm_var": 0.0062255859375, "learning_rate": 0.0001, "loss": 4.061, "loss/crossentropy": 2.0534290075302124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21150045841932297, "step": 13682 }, { "epoch": 0.27368, "grad_norm": 2.015625, "grad_norm_var": 0.0061920166015625, "learning_rate": 0.0001, "loss": 4.1682, "loss/crossentropy": 1.726151466369629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19256682693958282, "step": 13684 }, { "epoch": 0.27372, "grad_norm": 2.09375, "grad_norm_var": 0.005956013997395833, "learning_rate": 0.0001, "loss": 4.2666, "loss/crossentropy": 2.2024354934692383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2222209945321083, "step": 13686 }, { "epoch": 0.27376, "grad_norm": 2.171875, "grad_norm_var": 0.007347615559895834, "learning_rate": 0.0001, "loss": 4.3252, "loss/crossentropy": 2.095518469810486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2092512547969818, "step": 13688 }, { "epoch": 0.2738, "grad_norm": 2.078125, "grad_norm_var": 0.0074460347493489586, "learning_rate": 0.0001, "loss": 4.0389, "loss/crossentropy": 1.9978103637695312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20576970279216766, "step": 13690 }, { "epoch": 0.27384, "grad_norm": 1.875, "grad_norm_var": 0.008599599202473959, "learning_rate": 0.0001, "loss": 4.1023, "loss/crossentropy": 2.233831286430359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20514021068811417, "step": 13692 }, { "epoch": 0.27388, "grad_norm": 1.921875, "grad_norm_var": 0.010251617431640625, "learning_rate": 0.0001, "loss": 4.0479, "loss/crossentropy": 1.6729156970977783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18054980039596558, "step": 13694 }, { "epoch": 0.27392, "grad_norm": 2.140625, "grad_norm_var": 0.010910797119140624, "learning_rate": 0.0001, "loss": 4.1911, "loss/crossentropy": 2.0113388895988464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21645111590623856, "step": 13696 }, { "epoch": 0.27396, "grad_norm": 2.140625, "grad_norm_var": 0.010469563802083333, "learning_rate": 0.0001, "loss": 4.3487, "loss/crossentropy": 1.9311461448669434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23587358742952347, "step": 13698 }, { "epoch": 0.274, "grad_norm": 1.9453125, "grad_norm_var": 0.01889012654622396, "learning_rate": 0.0001, "loss": 4.1099, "loss/crossentropy": 2.1205984354019165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20603877305984497, "step": 13700 }, { "epoch": 0.27404, "grad_norm": 1.9296875, "grad_norm_var": 0.01889012654622396, "learning_rate": 0.0001, "loss": 4.2947, "loss/crossentropy": 2.1700201630592346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21855031698942184, "step": 13702 }, { "epoch": 0.27408, "grad_norm": 1.8828125, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 3.7717, "loss/crossentropy": 1.5643411874771118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.161862351000309, "step": 13704 }, { "epoch": 0.27412, "grad_norm": 1.8671875, "grad_norm_var": 0.024881998697916668, "learning_rate": 0.0001, "loss": 3.7292, "loss/crossentropy": 2.032666802406311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20749235153198242, "step": 13706 }, { "epoch": 0.27416, "grad_norm": 1.890625, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.0899, "loss/crossentropy": 1.9313859939575195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19065556675195694, "step": 13708 }, { "epoch": 0.2742, "grad_norm": 2.0, "grad_norm_var": 0.02225519816080729, "learning_rate": 0.0001, "loss": 4.3295, "loss/crossentropy": 2.1200226545333862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1967960000038147, "step": 13710 }, { "epoch": 0.27424, "grad_norm": 1.953125, "grad_norm_var": 0.020765940348307293, "learning_rate": 0.0001, "loss": 4.0655, "loss/crossentropy": 2.144526958465576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20019693672657013, "step": 13712 }, { "epoch": 0.27428, "grad_norm": 1.9609375, "grad_norm_var": 0.01809666951497396, "learning_rate": 0.0001, "loss": 4.2039, "loss/crossentropy": 1.9802407622337341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20123689621686935, "step": 13714 }, { "epoch": 0.27432, "grad_norm": 1.9453125, "grad_norm_var": 0.0062334696451822914, "learning_rate": 0.0001, "loss": 3.9415, "loss/crossentropy": 2.096527099609375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20282836258411407, "step": 13716 }, { "epoch": 0.27436, "grad_norm": 2.046875, "grad_norm_var": 0.005558013916015625, "learning_rate": 0.0001, "loss": 4.1829, "loss/crossentropy": 2.1037757992744446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039530873298645, "step": 13718 }, { "epoch": 0.2744, "grad_norm": 1.9609375, "grad_norm_var": 0.004752349853515625, "learning_rate": 0.0001, "loss": 3.8416, "loss/crossentropy": 1.5415751934051514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18478797376155853, "step": 13720 }, { "epoch": 0.27444, "grad_norm": 2.015625, "grad_norm_var": 0.001859283447265625, "learning_rate": 0.0001, "loss": 4.071, "loss/crossentropy": 2.0357913970947266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20176240801811218, "step": 13722 }, { "epoch": 0.27448, "grad_norm": 1.8125, "grad_norm_var": 0.0034543355305989582, "learning_rate": 0.0001, "loss": 3.8472, "loss/crossentropy": 1.9013367891311646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20012690126895905, "step": 13724 }, { "epoch": 0.27452, "grad_norm": 1.9921875, "grad_norm_var": 0.00343017578125, "learning_rate": 0.0001, "loss": 3.8072, "loss/crossentropy": 1.5798682570457458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18169572949409485, "step": 13726 }, { "epoch": 0.27456, "grad_norm": 1.984375, "grad_norm_var": 0.0034624735514322915, "learning_rate": 0.0001, "loss": 4.163, "loss/crossentropy": 2.196288228034973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22311393916606903, "step": 13728 }, { "epoch": 0.2746, "grad_norm": 1.96875, "grad_norm_var": 0.003928375244140625, "learning_rate": 0.0001, "loss": 4.0172, "loss/crossentropy": 1.714774489402771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17624646425247192, "step": 13730 }, { "epoch": 0.27464, "grad_norm": 2.15625, "grad_norm_var": 0.0053484598795572914, "learning_rate": 0.0001, "loss": 4.3264, "loss/crossentropy": 1.9853646159172058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18827050924301147, "step": 13732 }, { "epoch": 0.27468, "grad_norm": 1.7890625, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 4.0074, "loss/crossentropy": 1.7830212116241455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1801598072052002, "step": 13734 }, { "epoch": 0.27472, "grad_norm": 1.8984375, "grad_norm_var": 0.008046213785807292, "learning_rate": 0.0001, "loss": 3.7923, "loss/crossentropy": 1.7400763034820557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930072009563446, "step": 13736 }, { "epoch": 0.27476, "grad_norm": 2.046875, "grad_norm_var": 0.008421834309895833, "learning_rate": 0.0001, "loss": 4.0992, "loss/crossentropy": 2.2292455434799194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20573973655700684, "step": 13738 }, { "epoch": 0.2748, "grad_norm": 2.09375, "grad_norm_var": 0.009056599934895833, "learning_rate": 0.0001, "loss": 4.4305, "loss/crossentropy": 2.1399097442626953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265445813536644, "step": 13740 }, { "epoch": 0.27484, "grad_norm": 1.953125, "grad_norm_var": 0.009366861979166667, "learning_rate": 0.0001, "loss": 4.0596, "loss/crossentropy": 2.049125075340271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20623435080051422, "step": 13742 }, { "epoch": 0.27488, "grad_norm": 2.078125, "grad_norm_var": 0.010689036051432291, "learning_rate": 0.0001, "loss": 4.2678, "loss/crossentropy": 2.388680577278137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23650053143501282, "step": 13744 }, { "epoch": 0.27492, "grad_norm": 2.390625, "grad_norm_var": 0.019577789306640624, "learning_rate": 0.0001, "loss": 4.5091, "loss/crossentropy": 2.05421245098114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19932237267494202, "step": 13746 }, { "epoch": 0.27496, "grad_norm": 2.0, "grad_norm_var": 0.01969172159830729, "learning_rate": 0.0001, "loss": 3.8274, "loss/crossentropy": 1.892092227935791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1907898187637329, "step": 13748 }, { "epoch": 0.275, "grad_norm": 2.109375, "grad_norm_var": 0.018507639567057293, "learning_rate": 0.0001, "loss": 4.2196, "loss/crossentropy": 2.160037875175476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24229361861944199, "step": 13750 }, { "epoch": 0.27504, "grad_norm": 1.8984375, "grad_norm_var": 0.017451985677083334, "learning_rate": 0.0001, "loss": 4.1278, "loss/crossentropy": 2.5185710191726685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2394627332687378, "step": 13752 }, { "epoch": 0.27508, "grad_norm": 1.953125, "grad_norm_var": 0.01754735310872396, "learning_rate": 0.0001, "loss": 4.1551, "loss/crossentropy": 2.0613635778427124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19758722186088562, "step": 13754 }, { "epoch": 0.27512, "grad_norm": 2.046875, "grad_norm_var": 0.019496409098307292, "learning_rate": 0.0001, "loss": 4.1905, "loss/crossentropy": 1.9780999422073364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773972362279892, "step": 13756 }, { "epoch": 0.27516, "grad_norm": 2.1875, "grad_norm_var": 0.01987482706705729, "learning_rate": 0.0001, "loss": 4.3297, "loss/crossentropy": 2.1116772890090942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20809265226125717, "step": 13758 }, { "epoch": 0.2752, "grad_norm": 2.109375, "grad_norm_var": 0.021201324462890626, "learning_rate": 0.0001, "loss": 4.2079, "loss/crossentropy": 2.1072784662246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21189533174037933, "step": 13760 }, { "epoch": 0.27524, "grad_norm": 1.953125, "grad_norm_var": 0.014902496337890625, "learning_rate": 0.0001, "loss": 3.9102, "loss/crossentropy": 2.261967897415161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058071494102478, "step": 13762 }, { "epoch": 0.27528, "grad_norm": 2.28125, "grad_norm_var": 0.01571044921875, "learning_rate": 0.0001, "loss": 4.0684, "loss/crossentropy": 2.240622043609619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2305934578180313, "step": 13764 }, { "epoch": 0.27532, "grad_norm": 2.078125, "grad_norm_var": 0.013874308268229166, "learning_rate": 0.0001, "loss": 3.8268, "loss/crossentropy": 1.6203233003616333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17164954543113708, "step": 13766 }, { "epoch": 0.27536, "grad_norm": 1.8984375, "grad_norm_var": 0.013521067301432292, "learning_rate": 0.0001, "loss": 3.928, "loss/crossentropy": 2.171602725982666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20083081722259521, "step": 13768 }, { "epoch": 0.2754, "grad_norm": 2.046875, "grad_norm_var": 0.012645467122395834, "learning_rate": 0.0001, "loss": 4.2738, "loss/crossentropy": 2.2198195457458496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21113458275794983, "step": 13770 }, { "epoch": 0.27544, "grad_norm": 1.9375, "grad_norm_var": 0.012296549479166667, "learning_rate": 0.0001, "loss": 4.2833, "loss/crossentropy": 2.1037912368774414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21217171102762222, "step": 13772 }, { "epoch": 0.27548, "grad_norm": 1.9765625, "grad_norm_var": 0.012015533447265626, "learning_rate": 0.0001, "loss": 3.9824, "loss/crossentropy": 2.252090096473694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20365406572818756, "step": 13774 }, { "epoch": 0.27552, "grad_norm": 1.8828125, "grad_norm_var": 0.011701456705729167, "learning_rate": 0.0001, "loss": 3.9513, "loss/crossentropy": 1.8044906258583069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854797750711441, "step": 13776 }, { "epoch": 0.27556, "grad_norm": 2.09375, "grad_norm_var": 0.012284342447916667, "learning_rate": 0.0001, "loss": 4.5229, "loss/crossentropy": 2.2397952675819397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214948832988739, "step": 13778 }, { "epoch": 0.2756, "grad_norm": 2.34375, "grad_norm_var": 0.0164215087890625, "learning_rate": 0.0001, "loss": 4.3569, "loss/crossentropy": 1.776510238647461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19995201379060745, "step": 13780 }, { "epoch": 0.27564, "grad_norm": 4.0625, "grad_norm_var": 0.26768290201822914, "learning_rate": 0.0001, "loss": 4.3061, "loss/crossentropy": 2.470343589782715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194378450512886, "step": 13782 }, { "epoch": 0.27568, "grad_norm": 1.9921875, "grad_norm_var": 0.26253433227539064, "learning_rate": 0.0001, "loss": 4.0708, "loss/crossentropy": 2.1340490579605103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20280296355485916, "step": 13784 }, { "epoch": 0.27572, "grad_norm": 2.03125, "grad_norm_var": 0.26368179321289065, "learning_rate": 0.0001, "loss": 4.1072, "loss/crossentropy": 1.6801128387451172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16750500351190567, "step": 13786 }, { "epoch": 0.27576, "grad_norm": 1.9609375, "grad_norm_var": 0.26387939453125, "learning_rate": 0.0001, "loss": 3.9902, "loss/crossentropy": 1.9632562398910522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19115211814641953, "step": 13788 }, { "epoch": 0.2758, "grad_norm": 2.125, "grad_norm_var": 0.25916519165039065, "learning_rate": 0.0001, "loss": 4.2139, "loss/crossentropy": 2.0817149877548218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21430402994155884, "step": 13790 }, { "epoch": 0.27584, "grad_norm": 1.9296875, "grad_norm_var": 0.253088124593099, "learning_rate": 0.0001, "loss": 4.3244, "loss/crossentropy": 2.2736687660217285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22024155408143997, "step": 13792 }, { "epoch": 0.27588, "grad_norm": 1.9921875, "grad_norm_var": 0.25625712076822915, "learning_rate": 0.0001, "loss": 4.2548, "loss/crossentropy": 2.2632944583892822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210355743765831, "step": 13794 }, { "epoch": 0.27592, "grad_norm": 1.953125, "grad_norm_var": 0.26166966756184895, "learning_rate": 0.0001, "loss": 4.2799, "loss/crossentropy": 2.5457879304885864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22530706226825714, "step": 13796 }, { "epoch": 0.27596, "grad_norm": 1.859375, "grad_norm_var": 0.008410390218098958, "learning_rate": 0.0001, "loss": 3.8969, "loss/crossentropy": 2.0878910422325134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19901857525110245, "step": 13798 }, { "epoch": 0.276, "grad_norm": 2.015625, "grad_norm_var": 0.008348592122395833, "learning_rate": 0.0001, "loss": 4.3213, "loss/crossentropy": 1.803489863872528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19378191232681274, "step": 13800 }, { "epoch": 0.27604, "grad_norm": 2.0, "grad_norm_var": 0.009596506754557291, "learning_rate": 0.0001, "loss": 4.1916, "loss/crossentropy": 2.156951904296875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091423124074936, "step": 13802 }, { "epoch": 0.27608, "grad_norm": 1.890625, "grad_norm_var": 0.010493977864583334, "learning_rate": 0.0001, "loss": 3.9346, "loss/crossentropy": 1.7714723944664001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20779500901699066, "step": 13804 }, { "epoch": 0.27612, "grad_norm": 1.8984375, "grad_norm_var": 0.009590403238932291, "learning_rate": 0.0001, "loss": 4.0499, "loss/crossentropy": 1.8808923363685608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19429123401641846, "step": 13806 }, { "epoch": 0.27616, "grad_norm": 1.9375, "grad_norm_var": 0.005516560872395834, "learning_rate": 0.0001, "loss": 3.706, "loss/crossentropy": 1.6658846735954285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17177510261535645, "step": 13808 }, { "epoch": 0.2762, "grad_norm": 1.8359375, "grad_norm_var": 0.018697102864583332, "learning_rate": 0.0001, "loss": 4.0909, "loss/crossentropy": 2.052769422531128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20889723300933838, "step": 13810 }, { "epoch": 0.27624, "grad_norm": 2.015625, "grad_norm_var": 0.18883031209309895, "learning_rate": 0.0001, "loss": 4.0225, "loss/crossentropy": 1.5800148844718933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24654322117567062, "step": 13812 }, { "epoch": 0.27628, "grad_norm": 2.140625, "grad_norm_var": 0.5619504292805989, "learning_rate": 0.0001, "loss": 4.1059, "loss/crossentropy": 1.836085557937622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19563084840774536, "step": 13814 }, { "epoch": 0.27632, "grad_norm": 2.046875, "grad_norm_var": 0.5601722717285156, "learning_rate": 0.0001, "loss": 4.0077, "loss/crossentropy": 1.627321183681488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17698387801647186, "step": 13816 }, { "epoch": 0.27636, "grad_norm": 2.046875, "grad_norm_var": 0.55164794921875, "learning_rate": 0.0001, "loss": 4.2038, "loss/crossentropy": 2.0546024441719055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19832595437765121, "step": 13818 }, { "epoch": 0.2764, "grad_norm": 1.96875, "grad_norm_var": 0.5499501546223958, "learning_rate": 0.0001, "loss": 4.064, "loss/crossentropy": 2.112219452857971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19605965912342072, "step": 13820 }, { "epoch": 0.27644, "grad_norm": 2.078125, "grad_norm_var": 0.5405006408691406, "learning_rate": 0.0001, "loss": 4.3825, "loss/crossentropy": 1.9582284688949585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015855312347412, "step": 13822 }, { "epoch": 0.27648, "grad_norm": 1.953125, "grad_norm_var": 0.5307573954264323, "learning_rate": 0.0001, "loss": 4.2699, "loss/crossentropy": 2.54653799533844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23546704649925232, "step": 13824 }, { "epoch": 0.27652, "grad_norm": 1.953125, "grad_norm_var": 0.5336090087890625, "learning_rate": 0.0001, "loss": 4.0729, "loss/crossentropy": 1.630593478679657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16478916257619858, "step": 13826 }, { "epoch": 0.27656, "grad_norm": 2.046875, "grad_norm_var": 0.40966796875, "learning_rate": 0.0001, "loss": 4.0123, "loss/crossentropy": 1.7995671033859253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19886638224124908, "step": 13828 }, { "epoch": 0.2766, "grad_norm": 2.21875, "grad_norm_var": 0.0177886962890625, "learning_rate": 0.0001, "loss": 4.4547, "loss/crossentropy": 2.5423338413238525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.28057335317134857, "step": 13830 }, { "epoch": 0.27664, "grad_norm": 2.078125, "grad_norm_var": 0.018631744384765624, "learning_rate": 0.0001, "loss": 4.0213, "loss/crossentropy": 1.8524783849716187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20141787081956863, "step": 13832 }, { "epoch": 0.27668, "grad_norm": 1.9453125, "grad_norm_var": 0.019440714518229166, "learning_rate": 0.0001, "loss": 4.2155, "loss/crossentropy": 2.242451548576355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21003998070955276, "step": 13834 }, { "epoch": 0.27672, "grad_norm": 2.046875, "grad_norm_var": 0.0192047119140625, "learning_rate": 0.0001, "loss": 4.2665, "loss/crossentropy": 2.2434345483779907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21679577976465225, "step": 13836 }, { "epoch": 0.27676, "grad_norm": 2.0, "grad_norm_var": 0.019551595052083332, "learning_rate": 0.0001, "loss": 4.014, "loss/crossentropy": 1.6854392290115356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19326400756835938, "step": 13838 }, { "epoch": 0.2768, "grad_norm": 2.171875, "grad_norm_var": 0.026668294270833334, "learning_rate": 0.0001, "loss": 4.2837, "loss/crossentropy": 2.0702155232429504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043909877538681, "step": 13840 }, { "epoch": 0.27684, "grad_norm": 2.015625, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 4.0732, "loss/crossentropy": 1.964758813381195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19540227204561234, "step": 13842 }, { "epoch": 0.27688, "grad_norm": 2.046875, "grad_norm_var": 0.0151611328125, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 2.0383411645889282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21456278860569, "step": 13844 }, { "epoch": 0.27692, "grad_norm": 1.8984375, "grad_norm_var": 0.015461222330729166, "learning_rate": 0.0001, "loss": 4.1407, "loss/crossentropy": 2.020030975341797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20521440356969833, "step": 13846 }, { "epoch": 0.27696, "grad_norm": 1.890625, "grad_norm_var": 0.016110992431640624, "learning_rate": 0.0001, "loss": 4.1142, "loss/crossentropy": 1.7514970302581787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19116375595331192, "step": 13848 }, { "epoch": 0.277, "grad_norm": 1.9296875, "grad_norm_var": 0.016437784830729166, "learning_rate": 0.0001, "loss": 3.9209, "loss/crossentropy": 1.8194025754928589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18198709934949875, "step": 13850 }, { "epoch": 0.27704, "grad_norm": 1.9375, "grad_norm_var": 0.017223866780598958, "learning_rate": 0.0001, "loss": 3.7911, "loss/crossentropy": 1.878225862979889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18159592151641846, "step": 13852 }, { "epoch": 0.27708, "grad_norm": 2.0, "grad_norm_var": 0.0178619384765625, "learning_rate": 0.0001, "loss": 4.008, "loss/crossentropy": 1.8619664311408997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18098927289247513, "step": 13854 }, { "epoch": 0.27712, "grad_norm": 2.046875, "grad_norm_var": 0.003474934895833333, "learning_rate": 0.0001, "loss": 4.2336, "loss/crossentropy": 2.0475016832351685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19999271631240845, "step": 13856 }, { "epoch": 0.27716, "grad_norm": 1.9453125, "grad_norm_var": 0.004443105061848958, "learning_rate": 0.0001, "loss": 4.2412, "loss/crossentropy": 2.2279679775238037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21767108887434006, "step": 13858 }, { "epoch": 0.2772, "grad_norm": 1.9453125, "grad_norm_var": 0.0039670308430989586, "learning_rate": 0.0001, "loss": 3.8087, "loss/crossentropy": 1.801176130771637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18873406946659088, "step": 13860 }, { "epoch": 0.27724, "grad_norm": 1.8046875, "grad_norm_var": 0.005303700764973958, "learning_rate": 0.0001, "loss": 3.7958, "loss/crossentropy": 1.948195457458496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19519966840744019, "step": 13862 }, { "epoch": 0.27728, "grad_norm": 1.90625, "grad_norm_var": 0.0050432840983072914, "learning_rate": 0.0001, "loss": 4.0754, "loss/crossentropy": 2.1044042110443115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20454693585634232, "step": 13864 }, { "epoch": 0.27732, "grad_norm": 2.015625, "grad_norm_var": 0.0051910400390625, "learning_rate": 0.0001, "loss": 4.1338, "loss/crossentropy": 2.383382737636566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179596871137619, "step": 13866 }, { "epoch": 0.27736, "grad_norm": 1.9375, "grad_norm_var": 0.004621378580729167, "learning_rate": 0.0001, "loss": 3.9475, "loss/crossentropy": 1.7798078656196594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17871373891830444, "step": 13868 }, { "epoch": 0.2774, "grad_norm": 1.984375, "grad_norm_var": 0.004756418863932291, "learning_rate": 0.0001, "loss": 3.9937, "loss/crossentropy": 2.1839531660079956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20030800253152847, "step": 13870 }, { "epoch": 0.27744, "grad_norm": 1.9921875, "grad_norm_var": 0.005028279622395834, "learning_rate": 0.0001, "loss": 4.1637, "loss/crossentropy": 1.990351676940918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951240971684456, "step": 13872 }, { "epoch": 0.27748, "grad_norm": 2.140625, "grad_norm_var": 0.006573232014973959, "learning_rate": 0.0001, "loss": 4.2919, "loss/crossentropy": 2.191170334815979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230004720389843, "step": 13874 }, { "epoch": 0.27752, "grad_norm": 1.953125, "grad_norm_var": 0.015726470947265626, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 1.8893607258796692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138589546084404, "step": 13876 }, { "epoch": 0.27756, "grad_norm": 1.953125, "grad_norm_var": 0.012483723958333333, "learning_rate": 0.0001, "loss": 4.2749, "loss/crossentropy": 2.059523820877075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20147833973169327, "step": 13878 }, { "epoch": 0.2776, "grad_norm": 2.125, "grad_norm_var": 0.013206990559895833, "learning_rate": 0.0001, "loss": 4.4959, "loss/crossentropy": 2.0867209434509277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23967817425727844, "step": 13880 }, { "epoch": 0.27764, "grad_norm": 2.15625, "grad_norm_var": 0.028393300374348958, "learning_rate": 0.0001, "loss": 4.5704, "loss/crossentropy": 2.3017314672470093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23674342036247253, "step": 13882 }, { "epoch": 0.27768, "grad_norm": 2.046875, "grad_norm_var": 0.0261138916015625, "learning_rate": 0.0001, "loss": 4.3316, "loss/crossentropy": 2.241922974586487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21731412410736084, "step": 13884 }, { "epoch": 0.27772, "grad_norm": 2.03125, "grad_norm_var": 0.025724283854166665, "learning_rate": 0.0001, "loss": 4.3317, "loss/crossentropy": 2.3800796270370483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916473656892776, "step": 13886 }, { "epoch": 0.27776, "grad_norm": 2.09375, "grad_norm_var": 0.02467625935872396, "learning_rate": 0.0001, "loss": 4.2972, "loss/crossentropy": 1.966018259525299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21862812340259552, "step": 13888 }, { "epoch": 0.2778, "grad_norm": 1.828125, "grad_norm_var": 0.030104319254557293, "learning_rate": 0.0001, "loss": 4.1955, "loss/crossentropy": 1.920684039592743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026856392621994, "step": 13890 }, { "epoch": 0.27784, "grad_norm": 2.0625, "grad_norm_var": 0.02601318359375, "learning_rate": 0.0001, "loss": 3.9662, "loss/crossentropy": 2.1939095854759216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20429420471191406, "step": 13892 }, { "epoch": 0.27788, "grad_norm": 1.9765625, "grad_norm_var": 0.026292928059895835, "learning_rate": 0.0001, "loss": 4.2335, "loss/crossentropy": 2.1600992679595947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20445890724658966, "step": 13894 }, { "epoch": 0.27792, "grad_norm": 1.9453125, "grad_norm_var": 0.026805623372395834, "learning_rate": 0.0001, "loss": 3.9973, "loss/crossentropy": 1.9898765683174133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19056615233421326, "step": 13896 }, { "epoch": 0.27796, "grad_norm": 1.859375, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.3037, "loss/crossentropy": 2.0497539043426514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878463476896286, "step": 13898 }, { "epoch": 0.278, "grad_norm": 1.9375, "grad_norm_var": 0.007373046875, "learning_rate": 0.0001, "loss": 4.2299, "loss/crossentropy": 1.894934356212616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.183290496468544, "step": 13900 }, { "epoch": 0.27804, "grad_norm": 1.9765625, "grad_norm_var": 0.010094960530598959, "learning_rate": 0.0001, "loss": 3.858, "loss/crossentropy": 1.808231770992279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19447006285190582, "step": 13902 }, { "epoch": 0.27808, "grad_norm": 2.03125, "grad_norm_var": 0.009688059488932291, "learning_rate": 0.0001, "loss": 4.0553, "loss/crossentropy": 1.9384279251098633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20293861627578735, "step": 13904 }, { "epoch": 0.27812, "grad_norm": 2.28125, "grad_norm_var": 0.012835439046223958, "learning_rate": 0.0001, "loss": 4.3092, "loss/crossentropy": 2.4291017055511475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24332769960165024, "step": 13906 }, { "epoch": 0.27816, "grad_norm": 2.03125, "grad_norm_var": 0.012669881184895834, "learning_rate": 0.0001, "loss": 4.331, "loss/crossentropy": 2.3018234968185425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21657106280326843, "step": 13908 }, { "epoch": 0.2782, "grad_norm": 2.390625, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 4.1756, "loss/crossentropy": 2.211503267288208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19193635880947113, "step": 13910 }, { "epoch": 0.27824, "grad_norm": 2.171875, "grad_norm_var": 0.02191162109375, "learning_rate": 0.0001, "loss": 4.372, "loss/crossentropy": 2.17062246799469, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21989908069372177, "step": 13912 }, { "epoch": 0.27828, "grad_norm": 2.015625, "grad_norm_var": 0.018839518229166668, "learning_rate": 0.0001, "loss": 4.0895, "loss/crossentropy": 2.1446024775505066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2036902904510498, "step": 13914 }, { "epoch": 0.27832, "grad_norm": 2.015625, "grad_norm_var": 0.0177642822265625, "learning_rate": 0.0001, "loss": 4.0729, "loss/crossentropy": 1.8176022171974182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19358078390359879, "step": 13916 }, { "epoch": 0.27836, "grad_norm": 2.09375, "grad_norm_var": 0.01946996053059896, "learning_rate": 0.0001, "loss": 4.2376, "loss/crossentropy": 2.341936469078064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22822833061218262, "step": 13918 }, { "epoch": 0.2784, "grad_norm": 2.046875, "grad_norm_var": 0.01697565714518229, "learning_rate": 0.0001, "loss": 4.0943, "loss/crossentropy": 1.9315840601921082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17218464612960815, "step": 13920 }, { "epoch": 0.27844, "grad_norm": 1.890625, "grad_norm_var": 0.01615168253580729, "learning_rate": 0.0001, "loss": 4.1507, "loss/crossentropy": 1.9442673921585083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19567463546991348, "step": 13922 }, { "epoch": 0.27848, "grad_norm": 1.9921875, "grad_norm_var": 0.016673787434895834, "learning_rate": 0.0001, "loss": 4.1611, "loss/crossentropy": 2.3023592233657837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21824805438518524, "step": 13924 }, { "epoch": 0.27852, "grad_norm": 1.8359375, "grad_norm_var": 0.01170654296875, "learning_rate": 0.0001, "loss": 3.7557, "loss/crossentropy": 2.116863250732422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087356001138687, "step": 13926 }, { "epoch": 0.27856, "grad_norm": 1.953125, "grad_norm_var": 0.010936482747395834, "learning_rate": 0.0001, "loss": 4.3746, "loss/crossentropy": 2.0485963821411133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18910974264144897, "step": 13928 }, { "epoch": 0.2786, "grad_norm": 2.0, "grad_norm_var": 0.024857584635416666, "learning_rate": 0.0001, "loss": 4.1309, "loss/crossentropy": 2.031753659248352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23851437866687775, "step": 13930 }, { "epoch": 0.27864, "grad_norm": 1.953125, "grad_norm_var": 0.025585683186848958, "learning_rate": 0.0001, "loss": 4.0688, "loss/crossentropy": 1.8019705414772034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18659329414367676, "step": 13932 }, { "epoch": 0.27868, "grad_norm": 2.109375, "grad_norm_var": 0.025243123372395832, "learning_rate": 0.0001, "loss": 3.9609, "loss/crossentropy": 1.9191861152648926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19393808394670486, "step": 13934 }, { "epoch": 0.27872, "grad_norm": 1.984375, "grad_norm_var": 0.025763956705729167, "learning_rate": 0.0001, "loss": 4.1765, "loss/crossentropy": 2.242986798286438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.213445246219635, "step": 13936 }, { "epoch": 0.27876, "grad_norm": 2.09375, "grad_norm_var": 0.0300689697265625, "learning_rate": 0.0001, "loss": 4.1767, "loss/crossentropy": 2.1652570962905884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2070690169930458, "step": 13938 }, { "epoch": 0.2788, "grad_norm": 1.9140625, "grad_norm_var": 0.031583404541015624, "learning_rate": 0.0001, "loss": 3.9643, "loss/crossentropy": 2.0970187187194824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20118873566389084, "step": 13940 }, { "epoch": 0.27884, "grad_norm": 2.078125, "grad_norm_var": 0.02823053995768229, "learning_rate": 0.0001, "loss": 4.1116, "loss/crossentropy": 1.9957542419433594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2377041131258011, "step": 13942 }, { "epoch": 0.27888, "grad_norm": 2.0, "grad_norm_var": 0.02522761027018229, "learning_rate": 0.0001, "loss": 4.0718, "loss/crossentropy": 1.8680259585380554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919926255941391, "step": 13944 }, { "epoch": 0.27892, "grad_norm": 2.0625, "grad_norm_var": 0.015516916910807291, "learning_rate": 0.0001, "loss": 4.301, "loss/crossentropy": 2.369240164756775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24805612862110138, "step": 13946 }, { "epoch": 0.27896, "grad_norm": 2.0625, "grad_norm_var": 0.014530436197916666, "learning_rate": 0.0001, "loss": 4.3082, "loss/crossentropy": 2.0923795104026794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20823675394058228, "step": 13948 }, { "epoch": 0.279, "grad_norm": 2.1875, "grad_norm_var": 0.013197580973307291, "learning_rate": 0.0001, "loss": 4.3049, "loss/crossentropy": 2.1777398586273193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21319355070590973, "step": 13950 }, { "epoch": 0.27904, "grad_norm": 1.890625, "grad_norm_var": 0.014869944254557291, "learning_rate": 0.0001, "loss": 4.0918, "loss/crossentropy": 2.2155595421791077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029823139309883, "step": 13952 }, { "epoch": 0.27908, "grad_norm": 1.96875, "grad_norm_var": 0.011435699462890626, "learning_rate": 0.0001, "loss": 3.7195, "loss/crossentropy": 1.5916873812675476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18207873404026031, "step": 13954 }, { "epoch": 0.27912, "grad_norm": 2.09375, "grad_norm_var": 0.009791819254557292, "learning_rate": 0.0001, "loss": 4.3336, "loss/crossentropy": 2.165639281272888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22917281091213226, "step": 13956 }, { "epoch": 0.27916, "grad_norm": 1.8359375, "grad_norm_var": 0.012851715087890625, "learning_rate": 0.0001, "loss": 3.9586, "loss/crossentropy": 2.0245776772499084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19862860441207886, "step": 13958 }, { "epoch": 0.2792, "grad_norm": 2.0625, "grad_norm_var": 0.013646443684895834, "learning_rate": 0.0001, "loss": 4.1593, "loss/crossentropy": 2.0179941654205322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20749768614768982, "step": 13960 }, { "epoch": 0.27924, "grad_norm": 2.140625, "grad_norm_var": 0.011966705322265625, "learning_rate": 0.0001, "loss": 4.3978, "loss/crossentropy": 1.998136818408966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992153376340866, "step": 13962 }, { "epoch": 0.27928, "grad_norm": 1.890625, "grad_norm_var": 0.015346018473307292, "learning_rate": 0.0001, "loss": 4.1801, "loss/crossentropy": 2.041996657848358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22224828600883484, "step": 13964 }, { "epoch": 0.27932, "grad_norm": 2.140625, "grad_norm_var": 0.019406890869140624, "learning_rate": 0.0001, "loss": 4.4133, "loss/crossentropy": 2.0259117484092712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001984491944313, "step": 13966 }, { "epoch": 0.27936, "grad_norm": 2.015625, "grad_norm_var": 0.018436686197916666, "learning_rate": 0.0001, "loss": 4.0439, "loss/crossentropy": 1.8219285607337952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19637393951416016, "step": 13968 }, { "epoch": 0.2794, "grad_norm": 2.046875, "grad_norm_var": 0.017073567708333334, "learning_rate": 0.0001, "loss": 4.0294, "loss/crossentropy": 2.1451289653778076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21846695989370346, "step": 13970 }, { "epoch": 0.27944, "grad_norm": 2.125, "grad_norm_var": 0.016877237955729166, "learning_rate": 0.0001, "loss": 4.349, "loss/crossentropy": 2.0527132749557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147705778479576, "step": 13972 }, { "epoch": 0.27948, "grad_norm": 1.9921875, "grad_norm_var": 0.015143839518229167, "learning_rate": 0.0001, "loss": 4.0188, "loss/crossentropy": 1.9238123893737793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21083690226078033, "step": 13974 }, { "epoch": 0.27952, "grad_norm": 1.953125, "grad_norm_var": 0.015057118733723958, "learning_rate": 0.0001, "loss": 4.0072, "loss/crossentropy": 1.894473671913147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20077680051326752, "step": 13976 }, { "epoch": 0.27956, "grad_norm": 2.03125, "grad_norm_var": 0.013871256510416667, "learning_rate": 0.0001, "loss": 4.2344, "loss/crossentropy": 2.139513611793518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138260081410408, "step": 13978 }, { "epoch": 0.2796, "grad_norm": 2.171875, "grad_norm_var": 0.013468424479166666, "learning_rate": 0.0001, "loss": 4.4519, "loss/crossentropy": 2.4868407249450684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23635651916265488, "step": 13980 }, { "epoch": 0.27964, "grad_norm": 1.9921875, "grad_norm_var": 0.007940419514973958, "learning_rate": 0.0001, "loss": 3.7428, "loss/crossentropy": 1.426945686340332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16342981159687042, "step": 13982 }, { "epoch": 0.27968, "grad_norm": 1.9609375, "grad_norm_var": 0.007885487874348958, "learning_rate": 0.0001, "loss": 4.0457, "loss/crossentropy": 2.0577683448791504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20394782721996307, "step": 13984 }, { "epoch": 0.27972, "grad_norm": 2.0625, "grad_norm_var": 0.008063761393229167, "learning_rate": 0.0001, "loss": 4.0045, "loss/crossentropy": 1.884658396244049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18946699798107147, "step": 13986 }, { "epoch": 0.27976, "grad_norm": 1.984375, "grad_norm_var": 0.0070383707682291664, "learning_rate": 0.0001, "loss": 3.9423, "loss/crossentropy": 1.7344964742660522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1923774629831314, "step": 13988 }, { "epoch": 0.2798, "grad_norm": 1.84375, "grad_norm_var": 0.00888671875, "learning_rate": 0.0001, "loss": 3.9541, "loss/crossentropy": 1.9660141468048096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18560642004013062, "step": 13990 }, { "epoch": 0.27984, "grad_norm": 2.046875, "grad_norm_var": 0.008998362223307292, "learning_rate": 0.0001, "loss": 3.9308, "loss/crossentropy": 1.7832527160644531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19922740012407303, "step": 13992 }, { "epoch": 0.27988, "grad_norm": 2.078125, "grad_norm_var": 0.009987131754557291, "learning_rate": 0.0001, "loss": 4.3919, "loss/crossentropy": 1.9763087630271912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111397087574005, "step": 13994 }, { "epoch": 0.27992, "grad_norm": 2.0625, "grad_norm_var": 0.005252838134765625, "learning_rate": 0.0001, "loss": 4.1331, "loss/crossentropy": 2.262092709541321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21212221682071686, "step": 13996 }, { "epoch": 0.27996, "grad_norm": 2.015625, "grad_norm_var": 0.0054595947265625, "learning_rate": 0.0001, "loss": 4.1946, "loss/crossentropy": 2.237086296081543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20895393192768097, "step": 13998 }, { "epoch": 0.28, "grad_norm": 1.890625, "grad_norm_var": 0.0061337788899739586, "learning_rate": 0.0001, "loss": 4.063, "loss/crossentropy": 1.9579973816871643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18024132400751114, "step": 14000 }, { "epoch": 0.28004, "grad_norm": 2.015625, "grad_norm_var": 0.0055844624837239586, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 2.2264128923416138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2181634083390236, "step": 14002 }, { "epoch": 0.28008, "grad_norm": 1.9453125, "grad_norm_var": 0.0058013916015625, "learning_rate": 0.0001, "loss": 4.0081, "loss/crossentropy": 1.836763322353363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983085498213768, "step": 14004 }, { "epoch": 0.28012, "grad_norm": 1.8515625, "grad_norm_var": 0.005448404947916667, "learning_rate": 0.0001, "loss": 4.1192, "loss/crossentropy": 2.0163257718086243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214015431702137, "step": 14006 }, { "epoch": 0.28016, "grad_norm": 2.140625, "grad_norm_var": 0.0070383707682291664, "learning_rate": 0.0001, "loss": 4.139, "loss/crossentropy": 2.31567645072937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832776606082916, "step": 14008 }, { "epoch": 0.2802, "grad_norm": 1.9140625, "grad_norm_var": 0.006685129801432292, "learning_rate": 0.0001, "loss": 4.0753, "loss/crossentropy": 2.3515210151672363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21946918964385986, "step": 14010 }, { "epoch": 0.28024, "grad_norm": 2.0, "grad_norm_var": 0.006845855712890625, "learning_rate": 0.0001, "loss": 3.9761, "loss/crossentropy": 2.0987448692321777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180740088224411, "step": 14012 }, { "epoch": 0.28028, "grad_norm": 2.015625, "grad_norm_var": 0.00662841796875, "learning_rate": 0.0001, "loss": 4.0909, "loss/crossentropy": 2.2203436493873596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157250940799713, "step": 14014 }, { "epoch": 0.28032, "grad_norm": 1.953125, "grad_norm_var": 0.006241861979166667, "learning_rate": 0.0001, "loss": 4.3152, "loss/crossentropy": 2.083233594894409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187238186597824, "step": 14016 }, { "epoch": 0.28036, "grad_norm": 2.015625, "grad_norm_var": 0.0061075846354166664, "learning_rate": 0.0001, "loss": 4.1311, "loss/crossentropy": 2.1261327266693115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063112109899521, "step": 14018 }, { "epoch": 0.2804, "grad_norm": 2.109375, "grad_norm_var": 0.006648508707682291, "learning_rate": 0.0001, "loss": 4.0901, "loss/crossentropy": 2.012390434741974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20225800573825836, "step": 14020 }, { "epoch": 0.28044, "grad_norm": 2.046875, "grad_norm_var": 0.0047271728515625, "learning_rate": 0.0001, "loss": 3.9985, "loss/crossentropy": 1.8605966567993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003621682524681, "step": 14022 }, { "epoch": 0.28048, "grad_norm": 1.9921875, "grad_norm_var": 0.0028157552083333333, "learning_rate": 0.0001, "loss": 3.9774, "loss/crossentropy": 1.9208670258522034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20056191086769104, "step": 14024 }, { "epoch": 0.28052, "grad_norm": 2.046875, "grad_norm_var": 0.003951009114583333, "learning_rate": 0.0001, "loss": 3.7462, "loss/crossentropy": 1.924518644809723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19410791993141174, "step": 14026 }, { "epoch": 0.28056, "grad_norm": 1.9921875, "grad_norm_var": 0.0035113016764322918, "learning_rate": 0.0001, "loss": 4.0319, "loss/crossentropy": 2.103038251399994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054024636745453, "step": 14028 }, { "epoch": 0.2806, "grad_norm": 1.8984375, "grad_norm_var": 0.004303995768229167, "learning_rate": 0.0001, "loss": 4.0526, "loss/crossentropy": 1.8974853157997131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18592579662799835, "step": 14030 }, { "epoch": 0.28064, "grad_norm": 2.03125, "grad_norm_var": 0.004447428385416666, "learning_rate": 0.0001, "loss": 4.1735, "loss/crossentropy": 2.0207908749580383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216909758746624, "step": 14032 }, { "epoch": 0.28068, "grad_norm": 2.171875, "grad_norm_var": 0.006119537353515625, "learning_rate": 0.0001, "loss": 4.5759, "loss/crossentropy": 2.0010873079299927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20815415680408478, "step": 14034 }, { "epoch": 0.28072, "grad_norm": 1.9609375, "grad_norm_var": 0.006961822509765625, "learning_rate": 0.0001, "loss": 3.9162, "loss/crossentropy": 2.0840484499931335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21481210738420486, "step": 14036 }, { "epoch": 0.28076, "grad_norm": 2.03125, "grad_norm_var": 0.006880442301432292, "learning_rate": 0.0001, "loss": 4.1298, "loss/crossentropy": 2.0026179552078247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1904347836971283, "step": 14038 }, { "epoch": 0.2808, "grad_norm": 1.9921875, "grad_norm_var": 0.006880442301432292, "learning_rate": 0.0001, "loss": 4.0618, "loss/crossentropy": 2.1660486459732056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21236423403024673, "step": 14040 }, { "epoch": 0.28084, "grad_norm": 2.15625, "grad_norm_var": 0.007669830322265625, "learning_rate": 0.0001, "loss": 4.0474, "loss/crossentropy": 2.312765598297119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20535308122634888, "step": 14042 }, { "epoch": 0.28088, "grad_norm": 1.984375, "grad_norm_var": 0.0077789306640625, "learning_rate": 0.0001, "loss": 4.1959, "loss/crossentropy": 1.8449203372001648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986582651734352, "step": 14044 }, { "epoch": 0.28092, "grad_norm": 1.8359375, "grad_norm_var": 0.008894602457682291, "learning_rate": 0.0001, "loss": 4.016, "loss/crossentropy": 2.0159433484077454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19724585115909576, "step": 14046 }, { "epoch": 0.28096, "grad_norm": 1.9765625, "grad_norm_var": 0.0086090087890625, "learning_rate": 0.0001, "loss": 4.0691, "loss/crossentropy": 2.138873815536499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438196301460266, "step": 14048 }, { "epoch": 0.281, "grad_norm": 2.25, "grad_norm_var": 0.011372629801432292, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 1.6896708607673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19260118901729584, "step": 14050 }, { "epoch": 0.28104, "grad_norm": 1.828125, "grad_norm_var": 0.014656321207682291, "learning_rate": 0.0001, "loss": 3.8383, "loss/crossentropy": 1.8495243191719055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18072611093521118, "step": 14052 }, { "epoch": 0.28108, "grad_norm": 1.9765625, "grad_norm_var": 0.017354329427083332, "learning_rate": 0.0001, "loss": 4.3581, "loss/crossentropy": 2.193492293357849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2247578352689743, "step": 14054 }, { "epoch": 0.28112, "grad_norm": 1.96875, "grad_norm_var": 0.01761652628580729, "learning_rate": 0.0001, "loss": 4.1252, "loss/crossentropy": 2.154744803905487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20149191468954086, "step": 14056 }, { "epoch": 0.28116, "grad_norm": 1.9921875, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 4.0832, "loss/crossentropy": 2.137023687362671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20409516990184784, "step": 14058 }, { "epoch": 0.2812, "grad_norm": 2.96875, "grad_norm_var": 0.0767242431640625, "learning_rate": 0.0001, "loss": 4.6582, "loss/crossentropy": 2.4530670642852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24031317979097366, "step": 14060 }, { "epoch": 0.28124, "grad_norm": 2.265625, "grad_norm_var": 0.07713394165039063, "learning_rate": 0.0001, "loss": 4.2973, "loss/crossentropy": 2.3285664319992065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223993182182312, "step": 14062 }, { "epoch": 0.28128, "grad_norm": 1.90625, "grad_norm_var": 0.0784088134765625, "learning_rate": 0.0001, "loss": 4.0899, "loss/crossentropy": 2.126828193664551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21428050100803375, "step": 14064 }, { "epoch": 0.28132, "grad_norm": 1.796875, "grad_norm_var": 0.08012288411458333, "learning_rate": 0.0001, "loss": 3.8083, "loss/crossentropy": 2.0194268226623535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20124298334121704, "step": 14066 }, { "epoch": 0.28136, "grad_norm": 2.125, "grad_norm_var": 0.07445882161458334, "learning_rate": 0.0001, "loss": 4.0002, "loss/crossentropy": 1.796358585357666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1902359575033188, "step": 14068 }, { "epoch": 0.2814, "grad_norm": 2.15625, "grad_norm_var": 0.07318700154622396, "learning_rate": 0.0001, "loss": 4.2, "loss/crossentropy": 1.8975006341934204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19528520107269287, "step": 14070 }, { "epoch": 0.28144, "grad_norm": 1.9296875, "grad_norm_var": 0.07691141764322916, "learning_rate": 0.0001, "loss": 3.9885, "loss/crossentropy": 1.628948986530304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1823427528142929, "step": 14072 }, { "epoch": 0.28148, "grad_norm": 1.9140625, "grad_norm_var": 0.07718480428059896, "learning_rate": 0.0001, "loss": 4.0404, "loss/crossentropy": 1.9894697070121765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945328414440155, "step": 14074 }, { "epoch": 0.28152, "grad_norm": 2.125, "grad_norm_var": 0.022489166259765624, "learning_rate": 0.0001, "loss": 4.0927, "loss/crossentropy": 1.9978017210960388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21055076271295547, "step": 14076 }, { "epoch": 0.28156, "grad_norm": 1.921875, "grad_norm_var": 0.01697565714518229, "learning_rate": 0.0001, "loss": 3.9224, "loss/crossentropy": 2.087389588356018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19179877638816833, "step": 14078 }, { "epoch": 0.2816, "grad_norm": 1.96875, "grad_norm_var": 0.018308258056640624, "learning_rate": 0.0001, "loss": 4.3356, "loss/crossentropy": 2.3281190395355225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2292996421456337, "step": 14080 }, { "epoch": 0.28164, "grad_norm": 1.953125, "grad_norm_var": 0.015215810139973958, "learning_rate": 0.0001, "loss": 3.8584, "loss/crossentropy": 1.8850311040878296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19652675837278366, "step": 14082 }, { "epoch": 0.28168, "grad_norm": 1.8671875, "grad_norm_var": 0.015604654947916666, "learning_rate": 0.0001, "loss": 3.9013, "loss/crossentropy": 1.757595181465149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17358998954296112, "step": 14084 }, { "epoch": 0.28172, "grad_norm": 2.015625, "grad_norm_var": 0.01387939453125, "learning_rate": 0.0001, "loss": 4.1367, "loss/crossentropy": 2.173685908317566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22043757140636444, "step": 14086 }, { "epoch": 0.28176, "grad_norm": 1.8203125, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 4.0381, "loss/crossentropy": 2.0290868282318115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19695669412612915, "step": 14088 }, { "epoch": 0.2818, "grad_norm": 2.125, "grad_norm_var": 0.010609690348307292, "learning_rate": 0.0001, "loss": 4.3497, "loss/crossentropy": 2.320943236351013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21747629344463348, "step": 14090 }, { "epoch": 0.28184, "grad_norm": 1.9296875, "grad_norm_var": 0.011205037434895834, "learning_rate": 0.0001, "loss": 4.0918, "loss/crossentropy": 2.0311750173568726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19410807639360428, "step": 14092 }, { "epoch": 0.28188, "grad_norm": 2.03125, "grad_norm_var": 0.011207834879557291, "learning_rate": 0.0001, "loss": 4.1303, "loss/crossentropy": 2.164630174636841, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962611824274063, "step": 14094 }, { "epoch": 0.28192, "grad_norm": 1.9453125, "grad_norm_var": 0.0093658447265625, "learning_rate": 0.0001, "loss": 3.9681, "loss/crossentropy": 2.170566439628601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24082274734973907, "step": 14096 }, { "epoch": 0.28196, "grad_norm": 2.0, "grad_norm_var": 0.009285227457682291, "learning_rate": 0.0001, "loss": 4.209, "loss/crossentropy": 2.3137707710266113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22475039958953857, "step": 14098 }, { "epoch": 0.282, "grad_norm": 1.7421875, "grad_norm_var": 0.010794830322265626, "learning_rate": 0.0001, "loss": 3.7432, "loss/crossentropy": 2.0448675751686096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19814980030059814, "step": 14100 }, { "epoch": 0.28204, "grad_norm": 1.96875, "grad_norm_var": 0.011423492431640625, "learning_rate": 0.0001, "loss": 4.1729, "loss/crossentropy": 2.095334231853485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19743670523166656, "step": 14102 }, { "epoch": 0.28208, "grad_norm": 1.9765625, "grad_norm_var": 0.008737945556640625, "learning_rate": 0.0001, "loss": 4.0362, "loss/crossentropy": 2.280429720878601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21895557641983032, "step": 14104 }, { "epoch": 0.28212, "grad_norm": 2.09375, "grad_norm_var": 0.008292388916015626, "learning_rate": 0.0001, "loss": 4.1166, "loss/crossentropy": 1.8548901677131653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19965855032205582, "step": 14106 }, { "epoch": 0.28216, "grad_norm": 2.234375, "grad_norm_var": 0.01163330078125, "learning_rate": 0.0001, "loss": 4.2589, "loss/crossentropy": 2.3574371337890625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23160286247730255, "step": 14108 }, { "epoch": 0.2822, "grad_norm": 2.078125, "grad_norm_var": 0.012800089518229167, "learning_rate": 0.0001, "loss": 3.932, "loss/crossentropy": 2.112724542617798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19597014784812927, "step": 14110 }, { "epoch": 0.28224, "grad_norm": 1.8828125, "grad_norm_var": 0.013068644205729167, "learning_rate": 0.0001, "loss": 3.8792, "loss/crossentropy": 1.9877798557281494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21120092272758484, "step": 14112 }, { "epoch": 0.28228, "grad_norm": 2.015625, "grad_norm_var": 0.013199869791666667, "learning_rate": 0.0001, "loss": 4.1761, "loss/crossentropy": 2.253028154373169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2046835869550705, "step": 14114 }, { "epoch": 0.28232, "grad_norm": 1.9921875, "grad_norm_var": 0.008561197916666667, "learning_rate": 0.0001, "loss": 4.1711, "loss/crossentropy": 2.322582960128784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22178008407354355, "step": 14116 }, { "epoch": 0.28236, "grad_norm": 2.03125, "grad_norm_var": 0.008204905192057292, "learning_rate": 0.0001, "loss": 4.1584, "loss/crossentropy": 2.470620036125183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23360955715179443, "step": 14118 }, { "epoch": 0.2824, "grad_norm": 1.90625, "grad_norm_var": 0.0086334228515625, "learning_rate": 0.0001, "loss": 4.0206, "loss/crossentropy": 1.9218884110450745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1978505551815033, "step": 14120 }, { "epoch": 0.28244, "grad_norm": 2.125, "grad_norm_var": 0.0092437744140625, "learning_rate": 0.0001, "loss": 4.2045, "loss/crossentropy": 1.9114753007888794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19295217841863632, "step": 14122 }, { "epoch": 0.28248, "grad_norm": 2.015625, "grad_norm_var": 0.498583984375, "learning_rate": 0.0001, "loss": 4.2259, "loss/crossentropy": 2.1196082830429077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068253606557846, "step": 14124 }, { "epoch": 0.28252, "grad_norm": 2.046875, "grad_norm_var": 0.492419179280599, "learning_rate": 0.0001, "loss": 4.2616, "loss/crossentropy": 1.9869969487190247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1950489580631256, "step": 14126 }, { "epoch": 0.28256, "grad_norm": 1.9609375, "grad_norm_var": 0.488177235921224, "learning_rate": 0.0001, "loss": 4.1635, "loss/crossentropy": 2.09599232673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22184181958436966, "step": 14128 }, { "epoch": 0.2826, "grad_norm": 2.046875, "grad_norm_var": 0.489172108968099, "learning_rate": 0.0001, "loss": 4.2238, "loss/crossentropy": 2.1891895532608032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2229236587882042, "step": 14130 }, { "epoch": 0.28264, "grad_norm": 2.171875, "grad_norm_var": 0.48787816365559894, "learning_rate": 0.0001, "loss": 4.12, "loss/crossentropy": 2.5070972442626953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21862925589084625, "step": 14132 }, { "epoch": 0.28268, "grad_norm": 2.03125, "grad_norm_var": 0.48812662760416664, "learning_rate": 0.0001, "loss": 4.0993, "loss/crossentropy": 2.100346863269806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19969436526298523, "step": 14134 }, { "epoch": 0.28272, "grad_norm": 2.1875, "grad_norm_var": 0.48713150024414065, "learning_rate": 0.0001, "loss": 4.2607, "loss/crossentropy": 2.0019801259040833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21319210529327393, "step": 14136 }, { "epoch": 0.28276, "grad_norm": 2.296875, "grad_norm_var": 0.4855323791503906, "learning_rate": 0.0001, "loss": 4.7069, "loss/crossentropy": 2.2734099626541138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24008513987064362, "step": 14138 }, { "epoch": 0.2828, "grad_norm": 2.484375, "grad_norm_var": 0.02581965128580729, "learning_rate": 0.0001, "loss": 4.1085, "loss/crossentropy": 2.2589277029037476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2224387675523758, "step": 14140 }, { "epoch": 0.28284, "grad_norm": 1.9609375, "grad_norm_var": 0.02667236328125, "learning_rate": 0.0001, "loss": 4.2749, "loss/crossentropy": 1.713346004486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19133105129003525, "step": 14142 }, { "epoch": 0.28288, "grad_norm": 2.0, "grad_norm_var": 0.02616144816080729, "learning_rate": 0.0001, "loss": 4.1792, "loss/crossentropy": 2.212652564048767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21326624602079391, "step": 14144 }, { "epoch": 0.28292, "grad_norm": 2.046875, "grad_norm_var": 0.02902399698893229, "learning_rate": 0.0001, "loss": 3.901, "loss/crossentropy": 1.8900847434997559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882227212190628, "step": 14146 }, { "epoch": 0.28296, "grad_norm": 2.25, "grad_norm_var": 0.028913370768229165, "learning_rate": 0.0001, "loss": 4.3775, "loss/crossentropy": 2.171455979347229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20516864210367203, "step": 14148 }, { "epoch": 0.283, "grad_norm": 2.046875, "grad_norm_var": 0.027608235677083332, "learning_rate": 0.0001, "loss": 4.3466, "loss/crossentropy": 2.0112149715423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20043446868658066, "step": 14150 }, { "epoch": 0.28304, "grad_norm": 1.96875, "grad_norm_var": 0.026596832275390624, "learning_rate": 0.0001, "loss": 4.2879, "loss/crossentropy": 2.0091559886932373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21023409068584442, "step": 14152 }, { "epoch": 0.28308, "grad_norm": 1.9296875, "grad_norm_var": 0.023045857747395832, "learning_rate": 0.0001, "loss": 3.9234, "loss/crossentropy": 2.122502863407135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023375853896141, "step": 14154 }, { "epoch": 0.28312, "grad_norm": 1.9609375, "grad_norm_var": 0.011462148030598958, "learning_rate": 0.0001, "loss": 4.3187, "loss/crossentropy": 2.2842462062835693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24532422423362732, "step": 14156 }, { "epoch": 0.28316, "grad_norm": 2.078125, "grad_norm_var": 0.010990397135416666, "learning_rate": 0.0001, "loss": 4.5248, "loss/crossentropy": 2.247039318084717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24580712616443634, "step": 14158 }, { "epoch": 0.2832, "grad_norm": 1.9609375, "grad_norm_var": 0.011432902018229166, "learning_rate": 0.0001, "loss": 3.9486, "loss/crossentropy": 2.182216167449951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21095345169305801, "step": 14160 }, { "epoch": 0.28324, "grad_norm": 1.9921875, "grad_norm_var": 0.008648427327473958, "learning_rate": 0.0001, "loss": 4.4582, "loss/crossentropy": 2.219786763191223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21747471392154694, "step": 14162 }, { "epoch": 0.28328, "grad_norm": 1.9453125, "grad_norm_var": 0.005830637613932292, "learning_rate": 0.0001, "loss": 3.9483, "loss/crossentropy": 1.8589028716087341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19629907608032227, "step": 14164 }, { "epoch": 0.28332, "grad_norm": 1.9140625, "grad_norm_var": 0.0138092041015625, "learning_rate": 0.0001, "loss": 4.0217, "loss/crossentropy": 1.9820821285247803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20433218777179718, "step": 14166 }, { "epoch": 0.28336, "grad_norm": 1.9296875, "grad_norm_var": 0.014216868082682292, "learning_rate": 0.0001, "loss": 4.0081, "loss/crossentropy": 1.8423896431922913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18210452795028687, "step": 14168 }, { "epoch": 0.2834, "grad_norm": 2.171875, "grad_norm_var": 0.0158203125, "learning_rate": 0.0001, "loss": 4.1244, "loss/crossentropy": 2.0365665555000305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21229179203510284, "step": 14170 }, { "epoch": 0.28344, "grad_norm": 1.7578125, "grad_norm_var": 0.019212849934895835, "learning_rate": 0.0001, "loss": 3.7894, "loss/crossentropy": 1.6819360256195068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20171043276786804, "step": 14172 }, { "epoch": 0.28348, "grad_norm": 1.8828125, "grad_norm_var": 0.019846343994140626, "learning_rate": 0.0001, "loss": 4.1247, "loss/crossentropy": 1.9932513236999512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21374855190515518, "step": 14174 }, { "epoch": 0.28352, "grad_norm": 1.96875, "grad_norm_var": 0.01975072224934896, "learning_rate": 0.0001, "loss": 3.8874, "loss/crossentropy": 1.7137236595153809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17538649588823318, "step": 14176 }, { "epoch": 0.28356, "grad_norm": 1.953125, "grad_norm_var": 0.01977717081705729, "learning_rate": 0.0001, "loss": 3.8762, "loss/crossentropy": 1.7120450735092163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19030355662107468, "step": 14178 }, { "epoch": 0.2836, "grad_norm": 1.9765625, "grad_norm_var": 0.019962565104166666, "learning_rate": 0.0001, "loss": 4.4603, "loss/crossentropy": 2.346954107284546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24070723354816437, "step": 14180 }, { "epoch": 0.28364, "grad_norm": 1.953125, "grad_norm_var": 0.013755035400390626, "learning_rate": 0.0001, "loss": 4.2238, "loss/crossentropy": 2.241390824317932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2247970700263977, "step": 14182 }, { "epoch": 0.28368, "grad_norm": 1.890625, "grad_norm_var": 0.02080078125, "learning_rate": 0.0001, "loss": 4.206, "loss/crossentropy": 2.234646439552307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21947349607944489, "step": 14184 }, { "epoch": 0.28372, "grad_norm": 1.9375, "grad_norm_var": 0.019010416666666665, "learning_rate": 0.0001, "loss": 4.0446, "loss/crossentropy": 1.9331231117248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20428168773651123, "step": 14186 }, { "epoch": 0.28376, "grad_norm": 1.9765625, "grad_norm_var": 0.015315500895182292, "learning_rate": 0.0001, "loss": 4.0184, "loss/crossentropy": 1.8166351318359375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21285146474838257, "step": 14188 }, { "epoch": 0.2838, "grad_norm": 1.9765625, "grad_norm_var": 0.015077463785807292, "learning_rate": 0.0001, "loss": 4.313, "loss/crossentropy": 2.181701421737671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20924244076013565, "step": 14190 }, { "epoch": 0.28384, "grad_norm": 2.078125, "grad_norm_var": 0.015558878580729166, "learning_rate": 0.0001, "loss": 4.0866, "loss/crossentropy": 2.1605955958366394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24021095037460327, "step": 14192 }, { "epoch": 0.28388, "grad_norm": 2.1875, "grad_norm_var": 0.015455881754557291, "learning_rate": 0.0001, "loss": 4.1617, "loss/crossentropy": 2.3546024560928345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24228987097740173, "step": 14194 }, { "epoch": 0.28392, "grad_norm": 2.015625, "grad_norm_var": 0.015314737955729166, "learning_rate": 0.0001, "loss": 4.0791, "loss/crossentropy": 2.2335511445999146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21917300671339035, "step": 14196 }, { "epoch": 0.28396, "grad_norm": 2.0, "grad_norm_var": 0.012760416666666666, "learning_rate": 0.0001, "loss": 3.9915, "loss/crossentropy": 1.8429189324378967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18830660730600357, "step": 14198 }, { "epoch": 0.284, "grad_norm": 2.015625, "grad_norm_var": 0.0058746337890625, "learning_rate": 0.0001, "loss": 4.2671, "loss/crossentropy": 2.4578241109848022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23527230322360992, "step": 14200 }, { "epoch": 0.28404, "grad_norm": 2.03125, "grad_norm_var": 0.00592041015625, "learning_rate": 0.0001, "loss": 4.3027, "loss/crossentropy": 2.0944234132766724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211083546280861, "step": 14202 }, { "epoch": 0.28408, "grad_norm": 1.90625, "grad_norm_var": 0.0066070556640625, "learning_rate": 0.0001, "loss": 3.9341, "loss/crossentropy": 1.8876588344573975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1908133327960968, "step": 14204 }, { "epoch": 0.28412, "grad_norm": 2.0625, "grad_norm_var": 0.006190745035807291, "learning_rate": 0.0001, "loss": 4.368, "loss/crossentropy": 2.0882590413093567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21159439533948898, "step": 14206 }, { "epoch": 0.28416, "grad_norm": 2.015625, "grad_norm_var": 0.005296834309895833, "learning_rate": 0.0001, "loss": 4.3005, "loss/crossentropy": 2.219560742378235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2282530516386032, "step": 14208 }, { "epoch": 0.2842, "grad_norm": 2.015625, "grad_norm_var": 0.004759724934895833, "learning_rate": 0.0001, "loss": 3.9308, "loss/crossentropy": 1.8383984565734863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1868659406900406, "step": 14210 }, { "epoch": 0.28424, "grad_norm": 2.125, "grad_norm_var": 0.006892649332682291, "learning_rate": 0.0001, "loss": 3.9613, "loss/crossentropy": 1.7509828209877014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1862550526857376, "step": 14212 }, { "epoch": 0.28428, "grad_norm": 1.9609375, "grad_norm_var": 0.007513173421223958, "learning_rate": 0.0001, "loss": 4.1301, "loss/crossentropy": 2.1544927954673767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19378253817558289, "step": 14214 }, { "epoch": 0.28432, "grad_norm": 2.03125, "grad_norm_var": 0.007533518473307291, "learning_rate": 0.0001, "loss": 4.1261, "loss/crossentropy": 2.498751997947693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22881630063056946, "step": 14216 }, { "epoch": 0.28436, "grad_norm": 2.09375, "grad_norm_var": 0.006780751546223958, "learning_rate": 0.0001, "loss": 4.3414, "loss/crossentropy": 2.2638392448425293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21407127380371094, "step": 14218 }, { "epoch": 0.2844, "grad_norm": 2.015625, "grad_norm_var": 0.006380208333333333, "learning_rate": 0.0001, "loss": 4.0469, "loss/crossentropy": 2.097207546234131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21202068030834198, "step": 14220 }, { "epoch": 0.28444, "grad_norm": 2.0625, "grad_norm_var": 0.007661946614583333, "learning_rate": 0.0001, "loss": 4.211, "loss/crossentropy": 2.0577695965766907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21462847292423248, "step": 14222 }, { "epoch": 0.28448, "grad_norm": 1.9921875, "grad_norm_var": 0.008373769124348958, "learning_rate": 0.0001, "loss": 4.2646, "loss/crossentropy": 2.3061472177505493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23122046887874603, "step": 14224 }, { "epoch": 0.28452, "grad_norm": 1.9921875, "grad_norm_var": 0.0068511962890625, "learning_rate": 0.0001, "loss": 3.947, "loss/crossentropy": 1.850829005241394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19229399412870407, "step": 14226 }, { "epoch": 0.28456, "grad_norm": 1.9375, "grad_norm_var": 0.0051348368326822914, "learning_rate": 0.0001, "loss": 4.0891, "loss/crossentropy": 1.9520751237869263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19159140437841415, "step": 14228 }, { "epoch": 0.2846, "grad_norm": 1.875, "grad_norm_var": 0.005145009358723958, "learning_rate": 0.0001, "loss": 4.2161, "loss/crossentropy": 2.507380247116089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23365024477243423, "step": 14230 }, { "epoch": 0.28464, "grad_norm": 2.046875, "grad_norm_var": 0.006261952718098958, "learning_rate": 0.0001, "loss": 4.0787, "loss/crossentropy": 1.9148982763290405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19785508513450623, "step": 14232 }, { "epoch": 0.28468, "grad_norm": 1.8984375, "grad_norm_var": 0.007769521077473958, "learning_rate": 0.0001, "loss": 4.1882, "loss/crossentropy": 2.004162549972534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192252054810524, "step": 14234 }, { "epoch": 0.28472, "grad_norm": 1.9375, "grad_norm_var": 0.0078277587890625, "learning_rate": 0.0001, "loss": 4.1318, "loss/crossentropy": 2.1455901861190796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19151784479618073, "step": 14236 }, { "epoch": 0.28476, "grad_norm": 1.9765625, "grad_norm_var": 0.005580393473307291, "learning_rate": 0.0001, "loss": 3.9548, "loss/crossentropy": 2.2556833028793335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20640526711940765, "step": 14238 }, { "epoch": 0.2848, "grad_norm": 1.8828125, "grad_norm_var": 0.013698069254557292, "learning_rate": 0.0001, "loss": 4.3805, "loss/crossentropy": 2.1121991872787476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21154547482728958, "step": 14240 }, { "epoch": 0.28484, "grad_norm": 1.7890625, "grad_norm_var": 0.01586888631184896, "learning_rate": 0.0001, "loss": 3.7502, "loss/crossentropy": 1.9072884321212769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19220874458551407, "step": 14242 }, { "epoch": 0.28488, "grad_norm": 2.015625, "grad_norm_var": 0.0154052734375, "learning_rate": 0.0001, "loss": 4.1925, "loss/crossentropy": 1.907566249370575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20355048030614853, "step": 14244 }, { "epoch": 0.28492, "grad_norm": 1.859375, "grad_norm_var": 0.0149169921875, "learning_rate": 0.0001, "loss": 3.8873, "loss/crossentropy": 2.1140421628952026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20535241812467575, "step": 14246 }, { "epoch": 0.28496, "grad_norm": 2.125, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 4.1704, "loss/crossentropy": 2.192967176437378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20353975892066956, "step": 14248 }, { "epoch": 0.285, "grad_norm": 1.90625, "grad_norm_var": 0.015213775634765624, "learning_rate": 0.0001, "loss": 3.9064, "loss/crossentropy": 1.6658200025558472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17274750024080276, "step": 14250 }, { "epoch": 0.28504, "grad_norm": 2.171875, "grad_norm_var": 0.017411041259765624, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 2.014496326446533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.203952856361866, "step": 14252 }, { "epoch": 0.28508, "grad_norm": 2.15625, "grad_norm_var": 0.0249908447265625, "learning_rate": 0.0001, "loss": 4.4607, "loss/crossentropy": 2.1534151434898376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23526588827371597, "step": 14254 }, { "epoch": 0.28512, "grad_norm": 2.0625, "grad_norm_var": 0.01701838175455729, "learning_rate": 0.0001, "loss": 4.1263, "loss/crossentropy": 2.08686763048172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20443058758974075, "step": 14256 }, { "epoch": 0.28516, "grad_norm": 2.0625, "grad_norm_var": 0.0135406494140625, "learning_rate": 0.0001, "loss": 4.314, "loss/crossentropy": 1.9428821802139282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20955658704042435, "step": 14258 }, { "epoch": 0.2852, "grad_norm": 1.921875, "grad_norm_var": 0.015710194905598957, "learning_rate": 0.0001, "loss": 3.9157, "loss/crossentropy": 1.8959371447563171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20648592710494995, "step": 14260 }, { "epoch": 0.28524, "grad_norm": 1.8203125, "grad_norm_var": 0.017081705729166667, "learning_rate": 0.0001, "loss": 3.8246, "loss/crossentropy": 1.7842467427253723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17766441404819489, "step": 14262 }, { "epoch": 0.28528, "grad_norm": 1.84375, "grad_norm_var": 0.018805948893229167, "learning_rate": 0.0001, "loss": 4.0581, "loss/crossentropy": 1.989040195941925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20939016342163086, "step": 14264 }, { "epoch": 0.28532, "grad_norm": 1.953125, "grad_norm_var": 0.018656158447265626, "learning_rate": 0.0001, "loss": 3.9216, "loss/crossentropy": 2.0441301465034485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20979472994804382, "step": 14266 }, { "epoch": 0.28536, "grad_norm": 1.875, "grad_norm_var": 0.017970530192057292, "learning_rate": 0.0001, "loss": 4.2181, "loss/crossentropy": 1.9348166584968567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930733323097229, "step": 14268 }, { "epoch": 0.2854, "grad_norm": 1.8515625, "grad_norm_var": 0.009650675455729167, "learning_rate": 0.0001, "loss": 4.2039, "loss/crossentropy": 2.056081175804138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20383870601654053, "step": 14270 }, { "epoch": 0.28544, "grad_norm": 2.125, "grad_norm_var": 0.011271158854166666, "learning_rate": 0.0001, "loss": 4.4046, "loss/crossentropy": 2.2752946615219116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2458929866552353, "step": 14272 }, { "epoch": 0.28548, "grad_norm": 2.0625, "grad_norm_var": 0.010456339518229166, "learning_rate": 0.0001, "loss": 4.1693, "loss/crossentropy": 2.093926787376404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19211938977241516, "step": 14274 }, { "epoch": 0.28552, "grad_norm": 2.0, "grad_norm_var": 0.009913889567057292, "learning_rate": 0.0001, "loss": 4.1841, "loss/crossentropy": 2.448614239692688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23177827894687653, "step": 14276 }, { "epoch": 0.28556, "grad_norm": 1.765625, "grad_norm_var": 0.012401326497395834, "learning_rate": 0.0001, "loss": 4.0002, "loss/crossentropy": 2.0047106742858887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2162417769432068, "step": 14278 }, { "epoch": 0.2856, "grad_norm": 2.328125, "grad_norm_var": 0.017986806233723958, "learning_rate": 0.0001, "loss": 4.1669, "loss/crossentropy": 1.982455313205719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22751964628696442, "step": 14280 }, { "epoch": 0.28564, "grad_norm": 2.0625, "grad_norm_var": 0.017411295572916666, "learning_rate": 0.0001, "loss": 4.0355, "loss/crossentropy": 1.872545838356018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19046122580766678, "step": 14282 }, { "epoch": 0.28568, "grad_norm": 2.109375, "grad_norm_var": 0.016388956705729166, "learning_rate": 0.0001, "loss": 4.1266, "loss/crossentropy": 1.8999648690223694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155570462346077, "step": 14284 }, { "epoch": 0.28572, "grad_norm": 2.0, "grad_norm_var": 0.014129384358723959, "learning_rate": 0.0001, "loss": 3.9649, "loss/crossentropy": 1.8433392643928528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18906012177467346, "step": 14286 }, { "epoch": 0.28576, "grad_norm": 2.15625, "grad_norm_var": 0.014697011311848958, "learning_rate": 0.0001, "loss": 4.085, "loss/crossentropy": 1.817060947418213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20473239570856094, "step": 14288 }, { "epoch": 0.2858, "grad_norm": 2.265625, "grad_norm_var": 0.01796239217122396, "learning_rate": 0.0001, "loss": 4.1856, "loss/crossentropy": 2.0344348549842834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19991667568683624, "step": 14290 }, { "epoch": 0.28584, "grad_norm": 1.9609375, "grad_norm_var": 0.01784032185872396, "learning_rate": 0.0001, "loss": 4.2334, "loss/crossentropy": 2.3413926362991333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2365436628460884, "step": 14292 }, { "epoch": 0.28588, "grad_norm": 2.078125, "grad_norm_var": 0.015464019775390626, "learning_rate": 0.0001, "loss": 3.9437, "loss/crossentropy": 2.0959852933883667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19703902304172516, "step": 14294 }, { "epoch": 0.28592, "grad_norm": 7.625, "grad_norm_var": 1.9304239908854166, "learning_rate": 0.0001, "loss": 4.2144, "loss/crossentropy": 2.220101237297058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21996742486953735, "step": 14296 }, { "epoch": 0.28596, "grad_norm": 2.109375, "grad_norm_var": 1.915691884358724, "learning_rate": 0.0001, "loss": 4.1806, "loss/crossentropy": 2.073485493659973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020936980843544, "step": 14298 }, { "epoch": 0.286, "grad_norm": 1.96875, "grad_norm_var": 1.9216265360514322, "learning_rate": 0.0001, "loss": 3.9176, "loss/crossentropy": 2.228920817375183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22304438799619675, "step": 14300 }, { "epoch": 0.28604, "grad_norm": 1.890625, "grad_norm_var": 1.933642323811849, "learning_rate": 0.0001, "loss": 4.0831, "loss/crossentropy": 1.724998950958252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1592913344502449, "step": 14302 }, { "epoch": 0.28608, "grad_norm": 2.0, "grad_norm_var": 1.9375221252441406, "learning_rate": 0.0001, "loss": 4.1738, "loss/crossentropy": 1.8970724940299988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18843455612659454, "step": 14304 }, { "epoch": 0.28612, "grad_norm": 2.09375, "grad_norm_var": 1.9384295145670574, "learning_rate": 0.0001, "loss": 4.0808, "loss/crossentropy": 1.9957273602485657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18272704631090164, "step": 14306 }, { "epoch": 0.28616, "grad_norm": 2.015625, "grad_norm_var": 1.9298177083333334, "learning_rate": 0.0001, "loss": 4.3861, "loss/crossentropy": 2.1424412727355957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21485400944948196, "step": 14308 }, { "epoch": 0.2862, "grad_norm": 1.9765625, "grad_norm_var": 1.9411211649576823, "learning_rate": 0.0001, "loss": 3.9622, "loss/crossentropy": 1.7729334235191345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20436155050992966, "step": 14310 }, { "epoch": 0.28624, "grad_norm": 1.96875, "grad_norm_var": 0.013952382405598958, "learning_rate": 0.0001, "loss": 4.2495, "loss/crossentropy": 2.5863327980041504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24598564952611923, "step": 14312 }, { "epoch": 0.28628, "grad_norm": 1.9375, "grad_norm_var": 0.01014404296875, "learning_rate": 0.0001, "loss": 3.962, "loss/crossentropy": 1.9089832305908203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905580461025238, "step": 14314 }, { "epoch": 0.28632, "grad_norm": 2.09375, "grad_norm_var": 0.010228474934895834, "learning_rate": 0.0001, "loss": 4.2167, "loss/crossentropy": 2.0153337121009827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21556401252746582, "step": 14316 }, { "epoch": 0.28636, "grad_norm": 2.015625, "grad_norm_var": 0.008776601155598958, "learning_rate": 0.0001, "loss": 4.4037, "loss/crossentropy": 1.9692147970199585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17990782111883163, "step": 14318 }, { "epoch": 0.2864, "grad_norm": 1.8515625, "grad_norm_var": 0.010969034830729167, "learning_rate": 0.0001, "loss": 4.1049, "loss/crossentropy": 2.0404393076896667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108697146177292, "step": 14320 }, { "epoch": 0.28644, "grad_norm": 2.0, "grad_norm_var": 0.009488932291666667, "learning_rate": 0.0001, "loss": 4.2809, "loss/crossentropy": 2.133711099624634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1942032128572464, "step": 14322 }, { "epoch": 0.28648, "grad_norm": 2.0, "grad_norm_var": 0.004878489176432291, "learning_rate": 0.0001, "loss": 4.1605, "loss/crossentropy": 1.9240726232528687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18174728006124496, "step": 14324 }, { "epoch": 0.28652, "grad_norm": 2.0625, "grad_norm_var": 0.00850830078125, "learning_rate": 0.0001, "loss": 4.2671, "loss/crossentropy": 2.0952632427215576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20621410757303238, "step": 14326 }, { "epoch": 0.28656, "grad_norm": 2.046875, "grad_norm_var": 0.008326975504557292, "learning_rate": 0.0001, "loss": 4.0683, "loss/crossentropy": 2.2604600191116333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2310045212507248, "step": 14328 }, { "epoch": 0.2866, "grad_norm": 2.09375, "grad_norm_var": 0.00777587890625, "learning_rate": 0.0001, "loss": 4.2551, "loss/crossentropy": 2.0915167331695557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23344147205352783, "step": 14330 }, { "epoch": 0.28664, "grad_norm": 1.9765625, "grad_norm_var": 0.00804443359375, "learning_rate": 0.0001, "loss": 3.9575, "loss/crossentropy": 2.2928651571273804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23840509355068207, "step": 14332 }, { "epoch": 0.28668, "grad_norm": 2.0, "grad_norm_var": 0.008685048421223958, "learning_rate": 0.0001, "loss": 4.4469, "loss/crossentropy": 2.261072874069214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22056513279676437, "step": 14334 }, { "epoch": 0.28672, "grad_norm": 1.9296875, "grad_norm_var": 0.007883453369140625, "learning_rate": 0.0001, "loss": 3.9218, "loss/crossentropy": 1.7591394186019897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900082305073738, "step": 14336 }, { "epoch": 0.28676, "grad_norm": 1.9296875, "grad_norm_var": 0.008426666259765625, "learning_rate": 0.0001, "loss": 4.0457, "loss/crossentropy": 2.129893183708191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19578705728054047, "step": 14338 }, { "epoch": 0.2868, "grad_norm": 2.078125, "grad_norm_var": 0.008451334635416667, "learning_rate": 0.0001, "loss": 4.4884, "loss/crossentropy": 2.0376795530319214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088106870651245, "step": 14340 }, { "epoch": 0.28684, "grad_norm": 1.9921875, "grad_norm_var": 0.005018870035807292, "learning_rate": 0.0001, "loss": 4.282, "loss/crossentropy": 2.1484848260879517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113661915063858, "step": 14342 }, { "epoch": 0.28688, "grad_norm": 1.984375, "grad_norm_var": 0.00496826171875, "learning_rate": 0.0001, "loss": 4.2713, "loss/crossentropy": 2.204781651496887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20908530056476593, "step": 14344 }, { "epoch": 0.28692, "grad_norm": 2.09375, "grad_norm_var": 0.0052154541015625, "learning_rate": 0.0001, "loss": 4.1075, "loss/crossentropy": 1.8096813559532166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18912722170352936, "step": 14346 }, { "epoch": 0.28696, "grad_norm": 1.921875, "grad_norm_var": 0.005580393473307291, "learning_rate": 0.0001, "loss": 4.0057, "loss/crossentropy": 1.8014967441558838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18832575529813766, "step": 14348 }, { "epoch": 0.287, "grad_norm": 2.203125, "grad_norm_var": 0.00789794921875, "learning_rate": 0.0001, "loss": 4.3211, "loss/crossentropy": 2.1443156003952026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.232828289270401, "step": 14350 }, { "epoch": 0.28704, "grad_norm": 2.125, "grad_norm_var": 0.008259073893229166, "learning_rate": 0.0001, "loss": 4.0984, "loss/crossentropy": 2.1773927211761475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19964880496263504, "step": 14352 }, { "epoch": 0.28708, "grad_norm": 2.0, "grad_norm_var": 0.007795206705729167, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 2.157313585281372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149301916360855, "step": 14354 }, { "epoch": 0.28712, "grad_norm": 2.046875, "grad_norm_var": 0.007184855143229167, "learning_rate": 0.0001, "loss": 4.2938, "loss/crossentropy": 2.3431146144866943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2297225296497345, "step": 14356 }, { "epoch": 0.28716, "grad_norm": 2.03125, "grad_norm_var": 0.007940419514973958, "learning_rate": 0.0001, "loss": 4.2026, "loss/crossentropy": 1.894954264163971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19812744110822678, "step": 14358 }, { "epoch": 0.2872, "grad_norm": 1.9375, "grad_norm_var": 0.008784993489583334, "learning_rate": 0.0001, "loss": 4.0654, "loss/crossentropy": 2.1701435446739197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21183644235134125, "step": 14360 }, { "epoch": 0.28724, "grad_norm": 2.015625, "grad_norm_var": 0.007975006103515625, "learning_rate": 0.0001, "loss": 4.2457, "loss/crossentropy": 2.202796459197998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21229087561368942, "step": 14362 }, { "epoch": 0.28728, "grad_norm": 2.203125, "grad_norm_var": 0.009789784749348959, "learning_rate": 0.0001, "loss": 4.3692, "loss/crossentropy": 2.225682258605957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22512775659561157, "step": 14364 }, { "epoch": 0.28732, "grad_norm": 1.9453125, "grad_norm_var": 0.007088216145833334, "learning_rate": 0.0001, "loss": 4.0938, "loss/crossentropy": 1.9562655687332153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19557978957891464, "step": 14366 }, { "epoch": 0.28736, "grad_norm": 2.0, "grad_norm_var": 0.0062945048014322914, "learning_rate": 0.0001, "loss": 4.2922, "loss/crossentropy": 1.794309377670288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851721778512001, "step": 14368 }, { "epoch": 0.2874, "grad_norm": 2.0, "grad_norm_var": 0.006660970052083334, "learning_rate": 0.0001, "loss": 4.1198, "loss/crossentropy": 2.276759445667267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071758210659027, "step": 14370 }, { "epoch": 0.28744, "grad_norm": 1.90625, "grad_norm_var": 0.006696573893229167, "learning_rate": 0.0001, "loss": 4.1162, "loss/crossentropy": 2.332028031349182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22525647282600403, "step": 14372 }, { "epoch": 0.28748, "grad_norm": 2.078125, "grad_norm_var": 0.005997721354166667, "learning_rate": 0.0001, "loss": 4.2594, "loss/crossentropy": 1.975858986377716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18405582010746002, "step": 14374 }, { "epoch": 0.28752, "grad_norm": 1.984375, "grad_norm_var": 0.006453196207682292, "learning_rate": 0.0001, "loss": 4.0615, "loss/crossentropy": 2.002572774887085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19543466717004776, "step": 14376 }, { "epoch": 0.28756, "grad_norm": 1.96875, "grad_norm_var": 0.006703440348307292, "learning_rate": 0.0001, "loss": 4.1837, "loss/crossentropy": 2.400219678878784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23009777069091797, "step": 14378 }, { "epoch": 0.2876, "grad_norm": 2.0, "grad_norm_var": 0.003352864583333333, "learning_rate": 0.0001, "loss": 4.0071, "loss/crossentropy": 1.6437376737594604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17785440385341644, "step": 14380 }, { "epoch": 0.28764, "grad_norm": 2.15625, "grad_norm_var": 0.007225545247395834, "learning_rate": 0.0001, "loss": 4.1494, "loss/crossentropy": 1.8833640813827515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22255511581897736, "step": 14382 }, { "epoch": 0.28768, "grad_norm": 2.0, "grad_norm_var": 0.007100423177083333, "learning_rate": 0.0001, "loss": 4.0881, "loss/crossentropy": 1.9113904237747192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2050003483891487, "step": 14384 }, { "epoch": 0.28772, "grad_norm": 1.90625, "grad_norm_var": 0.007765452067057292, "learning_rate": 0.0001, "loss": 4.1114, "loss/crossentropy": 2.236249327659607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21349424868822098, "step": 14386 }, { "epoch": 0.28776, "grad_norm": 2.140625, "grad_norm_var": 0.008090972900390625, "learning_rate": 0.0001, "loss": 3.9282, "loss/crossentropy": 1.8709489703178406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19309207051992416, "step": 14388 }, { "epoch": 0.2878, "grad_norm": 1.9765625, "grad_norm_var": 0.007933553059895833, "learning_rate": 0.0001, "loss": 4.0871, "loss/crossentropy": 1.926324725151062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19902244210243225, "step": 14390 }, { "epoch": 0.28784, "grad_norm": 1.8671875, "grad_norm_var": 0.00810546875, "learning_rate": 0.0001, "loss": 4.085, "loss/crossentropy": 1.898009479045868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18698234856128693, "step": 14392 }, { "epoch": 0.28788, "grad_norm": 2.1875, "grad_norm_var": 0.0122711181640625, "learning_rate": 0.0001, "loss": 4.2367, "loss/crossentropy": 1.9728660583496094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033083364367485, "step": 14394 }, { "epoch": 0.28792, "grad_norm": 1.9296875, "grad_norm_var": 0.01282958984375, "learning_rate": 0.0001, "loss": 3.7965, "loss/crossentropy": 1.957255482673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20039385557174683, "step": 14396 }, { "epoch": 0.28796, "grad_norm": 2.0625, "grad_norm_var": 0.009897613525390625, "learning_rate": 0.0001, "loss": 3.8525, "loss/crossentropy": 1.8001562356948853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059212401509285, "step": 14398 }, { "epoch": 0.288, "grad_norm": 2.125, "grad_norm_var": 0.011177571614583333, "learning_rate": 0.0001, "loss": 4.1749, "loss/crossentropy": 1.9166946411132812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19456689059734344, "step": 14400 }, { "epoch": 0.28804, "grad_norm": 2.0625, "grad_norm_var": 0.01221923828125, "learning_rate": 0.0001, "loss": 4.3302, "loss/crossentropy": 2.033670485019684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24448300898075104, "step": 14402 }, { "epoch": 0.28808, "grad_norm": 1.9140625, "grad_norm_var": 0.012580362955729167, "learning_rate": 0.0001, "loss": 3.8679, "loss/crossentropy": 2.2668023705482483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20903322845697403, "step": 14404 }, { "epoch": 0.28812, "grad_norm": 1.9453125, "grad_norm_var": 0.012532297770182292, "learning_rate": 0.0001, "loss": 4.0638, "loss/crossentropy": 2.1076024770736694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20488093793392181, "step": 14406 }, { "epoch": 0.28816, "grad_norm": 2.078125, "grad_norm_var": 0.012117258707682292, "learning_rate": 0.0001, "loss": 4.0707, "loss/crossentropy": 1.996046781539917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20725766569375992, "step": 14408 }, { "epoch": 0.2882, "grad_norm": 2.046875, "grad_norm_var": 0.008365885416666666, "learning_rate": 0.0001, "loss": 4.2302, "loss/crossentropy": 2.2031015157699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19491394609212875, "step": 14410 }, { "epoch": 0.28824, "grad_norm": 2.109375, "grad_norm_var": 0.008714803059895833, "learning_rate": 0.0001, "loss": 4.3962, "loss/crossentropy": 2.0339081287384033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21979261189699173, "step": 14412 }, { "epoch": 0.28828, "grad_norm": 1.8515625, "grad_norm_var": 0.011641438802083333, "learning_rate": 0.0001, "loss": 4.0042, "loss/crossentropy": 1.9283286929130554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19970305263996124, "step": 14414 }, { "epoch": 0.28832, "grad_norm": 2.046875, "grad_norm_var": 0.011156972249348958, "learning_rate": 0.0001, "loss": 4.3879, "loss/crossentropy": 2.1092429161071777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2114599123597145, "step": 14416 }, { "epoch": 0.28836, "grad_norm": 1.9765625, "grad_norm_var": 0.01004638671875, "learning_rate": 0.0001, "loss": 4.1234, "loss/crossentropy": 1.8102002143859863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18407940864562988, "step": 14418 }, { "epoch": 0.2884, "grad_norm": 1.8984375, "grad_norm_var": 0.008906809488932292, "learning_rate": 0.0001, "loss": 4.0943, "loss/crossentropy": 2.3192719221115112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22208110988140106, "step": 14420 }, { "epoch": 0.28844, "grad_norm": 1.9921875, "grad_norm_var": 0.0082183837890625, "learning_rate": 0.0001, "loss": 4.2268, "loss/crossentropy": 2.2302430868148804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21011962741613388, "step": 14422 }, { "epoch": 0.28848, "grad_norm": 1.9765625, "grad_norm_var": 0.008542633056640625, "learning_rate": 0.0001, "loss": 4.1235, "loss/crossentropy": 2.3015987873077393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2012081891298294, "step": 14424 }, { "epoch": 0.28852, "grad_norm": 1.9921875, "grad_norm_var": 0.0077056884765625, "learning_rate": 0.0001, "loss": 4.3094, "loss/crossentropy": 2.060473084449768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20174618810415268, "step": 14426 }, { "epoch": 0.28856, "grad_norm": 2.3125, "grad_norm_var": 0.014082845052083333, "learning_rate": 0.0001, "loss": 3.9594, "loss/crossentropy": 1.8146299719810486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18100766092538834, "step": 14428 }, { "epoch": 0.2886, "grad_norm": 2.0, "grad_norm_var": 0.011277008056640624, "learning_rate": 0.0001, "loss": 3.8444, "loss/crossentropy": 2.1504935026168823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20493116229772568, "step": 14430 }, { "epoch": 0.28864, "grad_norm": 2.0, "grad_norm_var": 0.010796864827473959, "learning_rate": 0.0001, "loss": 4.1153, "loss/crossentropy": 2.4947997331619263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2339334562420845, "step": 14432 }, { "epoch": 0.28868, "grad_norm": 2.3125, "grad_norm_var": 0.01612116495768229, "learning_rate": 0.0001, "loss": 4.3011, "loss/crossentropy": 2.421883702278137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23815617710351944, "step": 14434 }, { "epoch": 0.28872, "grad_norm": 2.0, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 4.2152, "loss/crossentropy": 2.093776822090149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20425530523061752, "step": 14436 }, { "epoch": 0.28876, "grad_norm": 2.03125, "grad_norm_var": 0.01817601521809896, "learning_rate": 0.0001, "loss": 3.9126, "loss/crossentropy": 2.02763295173645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972591131925583, "step": 14438 }, { "epoch": 0.2888, "grad_norm": 2.015625, "grad_norm_var": 0.0176422119140625, "learning_rate": 0.0001, "loss": 4.1348, "loss/crossentropy": 1.9769265055656433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186862014234066, "step": 14440 }, { "epoch": 0.28884, "grad_norm": 1.9375, "grad_norm_var": 0.019024403889973958, "learning_rate": 0.0001, "loss": 4.4997, "loss/crossentropy": 2.34747314453125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22067507356405258, "step": 14442 }, { "epoch": 0.28888, "grad_norm": 1.921875, "grad_norm_var": 0.0128570556640625, "learning_rate": 0.0001, "loss": 4.1497, "loss/crossentropy": 2.1036205887794495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2231181040406227, "step": 14444 }, { "epoch": 0.28892, "grad_norm": 1.9140625, "grad_norm_var": 0.01638768513997396, "learning_rate": 0.0001, "loss": 4.251, "loss/crossentropy": 2.2350860834121704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23475627601146698, "step": 14446 }, { "epoch": 0.28896, "grad_norm": 2.09375, "grad_norm_var": 0.01778132120768229, "learning_rate": 0.0001, "loss": 4.2095, "loss/crossentropy": 2.2336645126342773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20643477141857147, "step": 14448 }, { "epoch": 0.289, "grad_norm": 1.9765625, "grad_norm_var": 0.012923177083333333, "learning_rate": 0.0001, "loss": 4.0793, "loss/crossentropy": 2.345365524291992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20650553703308105, "step": 14450 }, { "epoch": 0.28904, "grad_norm": 2.125, "grad_norm_var": 0.0120025634765625, "learning_rate": 0.0001, "loss": 4.131, "loss/crossentropy": 2.307543635368347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21354079991579056, "step": 14452 }, { "epoch": 0.28908, "grad_norm": 1.9921875, "grad_norm_var": 0.012001291910807291, "learning_rate": 0.0001, "loss": 4.1006, "loss/crossentropy": 1.8094390034675598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1850133240222931, "step": 14454 }, { "epoch": 0.28912, "grad_norm": 2.046875, "grad_norm_var": 0.011424763997395834, "learning_rate": 0.0001, "loss": 4.1326, "loss/crossentropy": 1.859872817993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.226328507065773, "step": 14456 }, { "epoch": 0.28916, "grad_norm": 1.9921875, "grad_norm_var": 0.009340159098307292, "learning_rate": 0.0001, "loss": 3.9997, "loss/crossentropy": 1.9507490396499634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18718113750219345, "step": 14458 }, { "epoch": 0.2892, "grad_norm": 1.859375, "grad_norm_var": 0.01126708984375, "learning_rate": 0.0001, "loss": 3.7868, "loss/crossentropy": 1.6642532348632812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1674812287092209, "step": 14460 }, { "epoch": 0.28924, "grad_norm": 1.859375, "grad_norm_var": 0.007039133707682292, "learning_rate": 0.0001, "loss": 3.9523, "loss/crossentropy": 1.9071928262710571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18480891734361649, "step": 14462 }, { "epoch": 0.28928, "grad_norm": 1.96875, "grad_norm_var": 0.005718739827473959, "learning_rate": 0.0001, "loss": 4.2876, "loss/crossentropy": 2.0758568048477173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002219259738922, "step": 14464 }, { "epoch": 0.28932, "grad_norm": 2.078125, "grad_norm_var": 0.006769816080729167, "learning_rate": 0.0001, "loss": 3.9438, "loss/crossentropy": 1.9777529835700989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19538866728544235, "step": 14466 }, { "epoch": 0.28936, "grad_norm": 2.0625, "grad_norm_var": 0.0062652587890625, "learning_rate": 0.0001, "loss": 4.1262, "loss/crossentropy": 2.008077323436737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20578377693891525, "step": 14468 }, { "epoch": 0.2894, "grad_norm": 1.8359375, "grad_norm_var": 0.006673177083333333, "learning_rate": 0.0001, "loss": 3.9341, "loss/crossentropy": 2.011266529560089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20298370718955994, "step": 14470 }, { "epoch": 0.28944, "grad_norm": 1.9375, "grad_norm_var": 0.0057769775390625, "learning_rate": 0.0001, "loss": 4.201, "loss/crossentropy": 2.231672167778015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21847998350858688, "step": 14472 }, { "epoch": 0.28948, "grad_norm": 1.984375, "grad_norm_var": 0.007279205322265625, "learning_rate": 0.0001, "loss": 4.382, "loss/crossentropy": 2.2268325090408325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22168640047311783, "step": 14474 }, { "epoch": 0.28952, "grad_norm": 1.9921875, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 4.2193, "loss/crossentropy": 2.2705942392349243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110700011253357, "step": 14476 }, { "epoch": 0.28956, "grad_norm": 1.8203125, "grad_norm_var": 0.012566884358723959, "learning_rate": 0.0001, "loss": 3.8754, "loss/crossentropy": 1.893052339553833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18865809589624405, "step": 14478 }, { "epoch": 0.2896, "grad_norm": 1.84375, "grad_norm_var": 0.013732655843098959, "learning_rate": 0.0001, "loss": 4.0989, "loss/crossentropy": 2.020704984664917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18822921812534332, "step": 14480 }, { "epoch": 0.28964, "grad_norm": 2.03125, "grad_norm_var": 0.014632161458333333, "learning_rate": 0.0001, "loss": 4.1651, "loss/crossentropy": 1.762313961982727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204545259475708, "step": 14482 }, { "epoch": 0.28968, "grad_norm": 1.9921875, "grad_norm_var": 0.013158162434895834, "learning_rate": 0.0001, "loss": 4.0261, "loss/crossentropy": 1.8793032765388489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17913836985826492, "step": 14484 }, { "epoch": 0.28972, "grad_norm": 1.890625, "grad_norm_var": 0.013372548421223958, "learning_rate": 0.0001, "loss": 4.2731, "loss/crossentropy": 2.428762197494507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23766332119703293, "step": 14486 }, { "epoch": 0.28976, "grad_norm": 2.046875, "grad_norm_var": 0.014969635009765624, "learning_rate": 0.0001, "loss": 4.1134, "loss/crossentropy": 2.2160138487815857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21619082987308502, "step": 14488 }, { "epoch": 0.2898, "grad_norm": 2.046875, "grad_norm_var": 0.0150787353515625, "learning_rate": 0.0001, "loss": 4.1049, "loss/crossentropy": 1.8169404864311218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981067657470703, "step": 14490 }, { "epoch": 0.28984, "grad_norm": 1.7890625, "grad_norm_var": 0.0131591796875, "learning_rate": 0.0001, "loss": 4.1456, "loss/crossentropy": 2.147166609764099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993183195590973, "step": 14492 }, { "epoch": 0.28988, "grad_norm": 1.9609375, "grad_norm_var": 0.011362457275390625, "learning_rate": 0.0001, "loss": 3.8507, "loss/crossentropy": 1.7740440964698792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18381935358047485, "step": 14494 }, { "epoch": 0.28992, "grad_norm": 1.921875, "grad_norm_var": 0.010428619384765626, "learning_rate": 0.0001, "loss": 4.0327, "loss/crossentropy": 2.206219792366028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21052061766386032, "step": 14496 }, { "epoch": 0.28996, "grad_norm": 2.015625, "grad_norm_var": 0.008600870768229166, "learning_rate": 0.0001, "loss": 4.4421, "loss/crossentropy": 2.4030661582946777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22094716131687164, "step": 14498 }, { "epoch": 0.29, "grad_norm": 2.25, "grad_norm_var": 0.016633097330729166, "learning_rate": 0.0001, "loss": 4.4593, "loss/crossentropy": 2.142518997192383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111787647008896, "step": 14500 }, { "epoch": 0.29004, "grad_norm": 1.953125, "grad_norm_var": 0.015633138020833333, "learning_rate": 0.0001, "loss": 4.3679, "loss/crossentropy": 1.9228865504264832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19389715045690536, "step": 14502 }, { "epoch": 0.29008, "grad_norm": 2.09375, "grad_norm_var": 0.014141591389973958, "learning_rate": 0.0001, "loss": 4.011, "loss/crossentropy": 1.789183259010315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18835698068141937, "step": 14504 }, { "epoch": 0.29012, "grad_norm": 1.9375, "grad_norm_var": 0.013798014322916666, "learning_rate": 0.0001, "loss": 4.2862, "loss/crossentropy": 2.3876765966415405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23156627267599106, "step": 14506 }, { "epoch": 0.29016, "grad_norm": 2.078125, "grad_norm_var": 0.010151926676432292, "learning_rate": 0.0001, "loss": 4.2024, "loss/crossentropy": 2.055350124835968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1995788812637329, "step": 14508 }, { "epoch": 0.2902, "grad_norm": 1.96875, "grad_norm_var": 0.010007476806640625, "learning_rate": 0.0001, "loss": 4.343, "loss/crossentropy": 2.2045267820358276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2197813093662262, "step": 14510 }, { "epoch": 0.29024, "grad_norm": 1.765625, "grad_norm_var": 0.014503733317057291, "learning_rate": 0.0001, "loss": 3.7136, "loss/crossentropy": 1.9596665501594543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17329587787389755, "step": 14512 }, { "epoch": 0.29028, "grad_norm": 1.9921875, "grad_norm_var": 0.017465972900390626, "learning_rate": 0.0001, "loss": 3.9546, "loss/crossentropy": 1.8273005485534668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17768846452236176, "step": 14514 }, { "epoch": 0.29032, "grad_norm": 1.8359375, "grad_norm_var": 0.011739095052083334, "learning_rate": 0.0001, "loss": 3.9924, "loss/crossentropy": 2.0428807735443115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104019746184349, "step": 14516 }, { "epoch": 0.29036, "grad_norm": 2.21875, "grad_norm_var": 0.014216868082682292, "learning_rate": 0.0001, "loss": 4.3591, "loss/crossentropy": 2.1242733001708984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2153768166899681, "step": 14518 }, { "epoch": 0.2904, "grad_norm": 1.875, "grad_norm_var": 0.013944244384765625, "learning_rate": 0.0001, "loss": 4.0831, "loss/crossentropy": 1.9223615527153015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18382133543491364, "step": 14520 }, { "epoch": 0.29044, "grad_norm": 1.9453125, "grad_norm_var": 0.013993326822916667, "learning_rate": 0.0001, "loss": 3.9527, "loss/crossentropy": 2.12492972612381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19818739593029022, "step": 14522 }, { "epoch": 0.29048, "grad_norm": 1.9609375, "grad_norm_var": 0.013665517171223959, "learning_rate": 0.0001, "loss": 4.1539, "loss/crossentropy": 2.0251020789146423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19633153080940247, "step": 14524 }, { "epoch": 0.29052, "grad_norm": 2.1875, "grad_norm_var": 0.014949544270833334, "learning_rate": 0.0001, "loss": 4.192, "loss/crossentropy": 2.177084445953369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22618898749351501, "step": 14526 }, { "epoch": 0.29056, "grad_norm": 2.0625, "grad_norm_var": 0.013997141520182292, "learning_rate": 0.0001, "loss": 3.9215, "loss/crossentropy": 1.718012809753418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18632279336452484, "step": 14528 }, { "epoch": 0.2906, "grad_norm": 1.9609375, "grad_norm_var": 0.017850494384765624, "learning_rate": 0.0001, "loss": 4.2869, "loss/crossentropy": 1.7026863098144531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.211147278547287, "step": 14530 }, { "epoch": 0.29064, "grad_norm": 1.8671875, "grad_norm_var": 0.017682902018229165, "learning_rate": 0.0001, "loss": 3.9369, "loss/crossentropy": 1.6652680039405823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16967280209064484, "step": 14532 }, { "epoch": 0.29068, "grad_norm": 1.9296875, "grad_norm_var": 0.013996378580729166, "learning_rate": 0.0001, "loss": 4.0815, "loss/crossentropy": 2.191527247428894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20215752720832825, "step": 14534 }, { "epoch": 0.29072, "grad_norm": 2.125, "grad_norm_var": 0.015927886962890624, "learning_rate": 0.0001, "loss": 3.8817, "loss/crossentropy": 1.8385645747184753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19906776398420334, "step": 14536 }, { "epoch": 0.29076, "grad_norm": 1.8671875, "grad_norm_var": 0.01665013631184896, "learning_rate": 0.0001, "loss": 3.9186, "loss/crossentropy": 2.0456249117851257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2057228460907936, "step": 14538 }, { "epoch": 0.2908, "grad_norm": 1.84375, "grad_norm_var": 0.018651326497395832, "learning_rate": 0.0001, "loss": 4.0636, "loss/crossentropy": 2.0413625836372375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20096366852521896, "step": 14540 }, { "epoch": 0.29084, "grad_norm": 1.9609375, "grad_norm_var": 0.016755167643229166, "learning_rate": 0.0001, "loss": 3.7819, "loss/crossentropy": 1.723297357559204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.180135078728199, "step": 14542 }, { "epoch": 0.29088, "grad_norm": 2.078125, "grad_norm_var": 0.01578343709309896, "learning_rate": 0.0001, "loss": 4.2115, "loss/crossentropy": 2.3143200874328613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2289530113339424, "step": 14544 }, { "epoch": 0.29092, "grad_norm": 2.046875, "grad_norm_var": 0.010017903645833333, "learning_rate": 0.0001, "loss": 4.408, "loss/crossentropy": 2.1612138748168945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21224550902843475, "step": 14546 }, { "epoch": 0.29096, "grad_norm": 1.921875, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.1101, "loss/crossentropy": 2.0591673851013184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18913161754608154, "step": 14548 }, { "epoch": 0.291, "grad_norm": 1.875, "grad_norm_var": 0.009505208333333333, "learning_rate": 0.0001, "loss": 4.132, "loss/crossentropy": 2.0901013016700745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19315654039382935, "step": 14550 }, { "epoch": 0.29104, "grad_norm": 1.9296875, "grad_norm_var": 0.006833648681640625, "learning_rate": 0.0001, "loss": 3.9351, "loss/crossentropy": 2.0716471672058105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951451674103737, "step": 14552 }, { "epoch": 0.29108, "grad_norm": 1.984375, "grad_norm_var": 0.008186848958333333, "learning_rate": 0.0001, "loss": 4.3656, "loss/crossentropy": 2.084408760070801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21354945749044418, "step": 14554 }, { "epoch": 0.29112, "grad_norm": 1.9453125, "grad_norm_var": 0.006486002604166667, "learning_rate": 0.0001, "loss": 4.0342, "loss/crossentropy": 2.154771566390991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20277246832847595, "step": 14556 }, { "epoch": 0.29116, "grad_norm": 2.09375, "grad_norm_var": 0.0069353739420572914, "learning_rate": 0.0001, "loss": 4.259, "loss/crossentropy": 2.24834144115448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23204077035188675, "step": 14558 }, { "epoch": 0.2912, "grad_norm": 1.859375, "grad_norm_var": 0.006951649983723958, "learning_rate": 0.0001, "loss": 3.8054, "loss/crossentropy": 1.8239850401878357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18208947032690048, "step": 14560 }, { "epoch": 0.29124, "grad_norm": 1.90625, "grad_norm_var": 0.006701405843098958, "learning_rate": 0.0001, "loss": 3.9465, "loss/crossentropy": 1.969277262687683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093023955821991, "step": 14562 }, { "epoch": 0.29128, "grad_norm": 2.046875, "grad_norm_var": 0.007670084635416667, "learning_rate": 0.0001, "loss": 3.7176, "loss/crossentropy": 1.561119556427002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1653747707605362, "step": 14564 }, { "epoch": 0.29132, "grad_norm": 2.0625, "grad_norm_var": 0.008154042561848958, "learning_rate": 0.0001, "loss": 4.3307, "loss/crossentropy": 2.2341216802597046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23967482149600983, "step": 14566 }, { "epoch": 0.29136, "grad_norm": 1.921875, "grad_norm_var": 0.008227284749348958, "learning_rate": 0.0001, "loss": 3.9899, "loss/crossentropy": 1.999854326248169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19598717242479324, "step": 14568 }, { "epoch": 0.2914, "grad_norm": 1.953125, "grad_norm_var": 0.006898752848307292, "learning_rate": 0.0001, "loss": 4.1262, "loss/crossentropy": 1.8079020380973816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919228658080101, "step": 14570 }, { "epoch": 0.29144, "grad_norm": 2.109375, "grad_norm_var": 0.009970855712890626, "learning_rate": 0.0001, "loss": 3.9946, "loss/crossentropy": 2.044901430606842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21737617254257202, "step": 14572 }, { "epoch": 0.29148, "grad_norm": 2.03125, "grad_norm_var": 0.009806060791015625, "learning_rate": 0.0001, "loss": 4.0498, "loss/crossentropy": 1.972197949886322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20828764885663986, "step": 14574 }, { "epoch": 0.29152, "grad_norm": 2.015625, "grad_norm_var": 0.010131581624348959, "learning_rate": 0.0001, "loss": 4.0011, "loss/crossentropy": 1.9129782915115356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17108886688947678, "step": 14576 }, { "epoch": 0.29156, "grad_norm": 2.109375, "grad_norm_var": 0.01068115234375, "learning_rate": 0.0001, "loss": 4.1458, "loss/crossentropy": 2.0545085072517395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20631857216358185, "step": 14578 }, { "epoch": 0.2916, "grad_norm": 2.015625, "grad_norm_var": 0.012654368082682292, "learning_rate": 0.0001, "loss": 4.355, "loss/crossentropy": 2.0274637937545776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2099764049053192, "step": 14580 }, { "epoch": 0.29164, "grad_norm": 1.875, "grad_norm_var": 0.013765207926432292, "learning_rate": 0.0001, "loss": 3.9014, "loss/crossentropy": 1.8154139518737793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19045238196849823, "step": 14582 }, { "epoch": 0.29168, "grad_norm": 2.0625, "grad_norm_var": 0.014937082926432291, "learning_rate": 0.0001, "loss": 4.0327, "loss/crossentropy": 2.061666965484619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21272550523281097, "step": 14584 }, { "epoch": 0.29172, "grad_norm": 2.03125, "grad_norm_var": 0.01580785115559896, "learning_rate": 0.0001, "loss": 4.6019, "loss/crossentropy": 2.3831146955490112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22327116876840591, "step": 14586 }, { "epoch": 0.29176, "grad_norm": 2.046875, "grad_norm_var": 0.013090006510416667, "learning_rate": 0.0001, "loss": 4.2142, "loss/crossentropy": 1.8024229407310486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1871618777513504, "step": 14588 }, { "epoch": 0.2918, "grad_norm": 2.0625, "grad_norm_var": 0.014967600504557291, "learning_rate": 0.0001, "loss": 4.1611, "loss/crossentropy": 2.289118528366089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22116435319185257, "step": 14590 }, { "epoch": 0.29184, "grad_norm": 1.9765625, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 4.1358, "loss/crossentropy": 2.11151522397995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22029711306095123, "step": 14592 }, { "epoch": 0.29188, "grad_norm": 2.125, "grad_norm_var": 0.013288370768229167, "learning_rate": 0.0001, "loss": 4.3103, "loss/crossentropy": 1.7856897711753845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1815958321094513, "step": 14594 }, { "epoch": 0.29192, "grad_norm": 2.078125, "grad_norm_var": 0.0101959228515625, "learning_rate": 0.0001, "loss": 4.1085, "loss/crossentropy": 1.9745690822601318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074003368616104, "step": 14596 }, { "epoch": 0.29196, "grad_norm": 1.953125, "grad_norm_var": 0.0091461181640625, "learning_rate": 0.0001, "loss": 4.3034, "loss/crossentropy": 1.9199401140213013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19998926669359207, "step": 14598 }, { "epoch": 0.292, "grad_norm": 1.953125, "grad_norm_var": 0.007743072509765625, "learning_rate": 0.0001, "loss": 4.0842, "loss/crossentropy": 1.9938844442367554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20138052105903625, "step": 14600 }, { "epoch": 0.29204, "grad_norm": 1.9765625, "grad_norm_var": 0.0059397379557291664, "learning_rate": 0.0001, "loss": 4.0025, "loss/crossentropy": 1.6290993094444275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18716775625944138, "step": 14602 }, { "epoch": 0.29208, "grad_norm": 1.875, "grad_norm_var": 0.006115468343098959, "learning_rate": 0.0001, "loss": 3.9953, "loss/crossentropy": 2.116807520389557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113799750804901, "step": 14604 }, { "epoch": 0.29212, "grad_norm": 2.140625, "grad_norm_var": 0.006392415364583333, "learning_rate": 0.0001, "loss": 4.0527, "loss/crossentropy": 1.6732030510902405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19776474684476852, "step": 14606 }, { "epoch": 0.29216, "grad_norm": 2.03125, "grad_norm_var": 0.006762440999348958, "learning_rate": 0.0001, "loss": 4.2572, "loss/crossentropy": 1.9403682351112366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18560070544481277, "step": 14608 }, { "epoch": 0.2922, "grad_norm": 2.3125, "grad_norm_var": 0.011641438802083333, "learning_rate": 0.0001, "loss": 4.2238, "loss/crossentropy": 1.9337440729141235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172246277332306, "step": 14610 }, { "epoch": 0.29224, "grad_norm": 2.0625, "grad_norm_var": 0.011836751302083334, "learning_rate": 0.0001, "loss": 4.1839, "loss/crossentropy": 2.1603177785873413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076597660779953, "step": 14612 }, { "epoch": 0.29228, "grad_norm": 1.953125, "grad_norm_var": 0.0118804931640625, "learning_rate": 0.0001, "loss": 4.1103, "loss/crossentropy": 1.979960322380066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2007075995206833, "step": 14614 }, { "epoch": 0.29232, "grad_norm": 1.9765625, "grad_norm_var": 0.0134429931640625, "learning_rate": 0.0001, "loss": 3.9901, "loss/crossentropy": 2.1262378096580505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033269852399826, "step": 14616 }, { "epoch": 0.29236, "grad_norm": 1.953125, "grad_norm_var": 0.013327789306640626, "learning_rate": 0.0001, "loss": 3.9001, "loss/crossentropy": 1.836608648300171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17032578587532043, "step": 14618 }, { "epoch": 0.2924, "grad_norm": 1.984375, "grad_norm_var": 0.012473297119140626, "learning_rate": 0.0001, "loss": 4.0165, "loss/crossentropy": 1.784467339515686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187791109085083, "step": 14620 }, { "epoch": 0.29244, "grad_norm": 1.9921875, "grad_norm_var": 0.010196940104166666, "learning_rate": 0.0001, "loss": 4.2963, "loss/crossentropy": 2.5023289918899536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2469349130988121, "step": 14622 }, { "epoch": 0.29248, "grad_norm": 2.1875, "grad_norm_var": 0.0142974853515625, "learning_rate": 0.0001, "loss": 4.3748, "loss/crossentropy": 2.244086742401123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20690800249576569, "step": 14624 }, { "epoch": 0.29252, "grad_norm": 2.109375, "grad_norm_var": 0.008829752604166666, "learning_rate": 0.0001, "loss": 4.2594, "loss/crossentropy": 2.167649209499359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23074676096439362, "step": 14626 }, { "epoch": 0.29256, "grad_norm": 1.9140625, "grad_norm_var": 0.0092437744140625, "learning_rate": 0.0001, "loss": 4.1837, "loss/crossentropy": 1.934467613697052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2041843980550766, "step": 14628 }, { "epoch": 0.2926, "grad_norm": 2.03125, "grad_norm_var": 0.0152740478515625, "learning_rate": 0.0001, "loss": 4.3277, "loss/crossentropy": 2.457708954811096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24325969815254211, "step": 14630 }, { "epoch": 0.29264, "grad_norm": 2.015625, "grad_norm_var": 0.014446767171223958, "learning_rate": 0.0001, "loss": 4.0523, "loss/crossentropy": 2.165565609931946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20386752486228943, "step": 14632 }, { "epoch": 0.29268, "grad_norm": 2.0625, "grad_norm_var": 0.013677724202473958, "learning_rate": 0.0001, "loss": 4.0767, "loss/crossentropy": 1.9475921988487244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2017974555492401, "step": 14634 }, { "epoch": 0.29272, "grad_norm": 1.8203125, "grad_norm_var": 0.015819295247395834, "learning_rate": 0.0001, "loss": 4.1435, "loss/crossentropy": 1.95538991689682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19920305162668228, "step": 14636 }, { "epoch": 0.29276, "grad_norm": 2.234375, "grad_norm_var": 0.01803766886393229, "learning_rate": 0.0001, "loss": 4.4073, "loss/crossentropy": 2.1813108921051025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2130545824766159, "step": 14638 }, { "epoch": 0.2928, "grad_norm": 1.8515625, "grad_norm_var": 0.01672337849934896, "learning_rate": 0.0001, "loss": 4.1841, "loss/crossentropy": 2.198926568031311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21660766005516052, "step": 14640 }, { "epoch": 0.29284, "grad_norm": 2.015625, "grad_norm_var": 0.01692682902018229, "learning_rate": 0.0001, "loss": 4.2872, "loss/crossentropy": 2.244659662246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22478342056274414, "step": 14642 }, { "epoch": 0.29288, "grad_norm": 1.984375, "grad_norm_var": 0.01608861287434896, "learning_rate": 0.0001, "loss": 4.234, "loss/crossentropy": 2.1159621477127075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22023546695709229, "step": 14644 }, { "epoch": 0.29292, "grad_norm": 1.859375, "grad_norm_var": 0.011067708333333334, "learning_rate": 0.0001, "loss": 3.9166, "loss/crossentropy": 2.2336114645004272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20683540403842926, "step": 14646 }, { "epoch": 0.29296, "grad_norm": 2.078125, "grad_norm_var": 0.011128743489583334, "learning_rate": 0.0001, "loss": 3.8647, "loss/crossentropy": 1.9015604257583618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2791958376765251, "step": 14648 }, { "epoch": 0.293, "grad_norm": 1.9140625, "grad_norm_var": 0.011180623372395834, "learning_rate": 0.0001, "loss": 3.7325, "loss/crossentropy": 1.6337950229644775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17279411852359772, "step": 14650 }, { "epoch": 0.29304, "grad_norm": 2.0625, "grad_norm_var": 0.009618123372395834, "learning_rate": 0.0001, "loss": 4.111, "loss/crossentropy": 1.920238435268402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18912465870380402, "step": 14652 }, { "epoch": 0.29308, "grad_norm": 2.046875, "grad_norm_var": 0.006075032552083333, "learning_rate": 0.0001, "loss": 4.4346, "loss/crossentropy": 2.5529074668884277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23994534462690353, "step": 14654 }, { "epoch": 0.29312, "grad_norm": 1.953125, "grad_norm_var": 0.0100982666015625, "learning_rate": 0.0001, "loss": 4.3154, "loss/crossentropy": 2.1320537328720093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21480896323919296, "step": 14656 }, { "epoch": 0.29316, "grad_norm": 1.8046875, "grad_norm_var": 0.013205718994140626, "learning_rate": 0.0001, "loss": 3.867, "loss/crossentropy": 1.8921156525611877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1749916449189186, "step": 14658 }, { "epoch": 0.2932, "grad_norm": 1.9140625, "grad_norm_var": 0.015433502197265626, "learning_rate": 0.0001, "loss": 3.7405, "loss/crossentropy": 2.2475300431251526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20675917714834213, "step": 14660 }, { "epoch": 0.29324, "grad_norm": 2.046875, "grad_norm_var": 0.015384928385416666, "learning_rate": 0.0001, "loss": 4.2091, "loss/crossentropy": 2.291227698326111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244400456547737, "step": 14662 }, { "epoch": 0.29328, "grad_norm": 2.09375, "grad_norm_var": 0.017976888020833335, "learning_rate": 0.0001, "loss": 4.1468, "loss/crossentropy": 1.8830837607383728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21137318760156631, "step": 14664 }, { "epoch": 0.29332, "grad_norm": 2.078125, "grad_norm_var": 0.019742838541666665, "learning_rate": 0.0001, "loss": 4.3424, "loss/crossentropy": 2.366846203804016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22360631078481674, "step": 14666 }, { "epoch": 0.29336, "grad_norm": 1.875, "grad_norm_var": 0.021720123291015626, "learning_rate": 0.0001, "loss": 4.1778, "loss/crossentropy": 2.165378987789154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088456228375435, "step": 14668 }, { "epoch": 0.2934, "grad_norm": 1.9609375, "grad_norm_var": 0.021809895833333332, "learning_rate": 0.0001, "loss": 4.178, "loss/crossentropy": 1.782981276512146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17654509842395782, "step": 14670 }, { "epoch": 0.29344, "grad_norm": 2.28125, "grad_norm_var": 0.021581013997395832, "learning_rate": 0.0001, "loss": 4.0796, "loss/crossentropy": 1.9891030192375183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18961766362190247, "step": 14672 }, { "epoch": 0.29348, "grad_norm": 1.9921875, "grad_norm_var": 0.015738932291666667, "learning_rate": 0.0001, "loss": 4.0359, "loss/crossentropy": 1.9350959062576294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124071717262268, "step": 14674 }, { "epoch": 0.29352, "grad_norm": 2.21875, "grad_norm_var": 0.01131591796875, "learning_rate": 0.0001, "loss": 4.2651, "loss/crossentropy": 2.1763634085655212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22027570754289627, "step": 14676 }, { "epoch": 0.29356, "grad_norm": 1.9453125, "grad_norm_var": 0.012499745686848958, "learning_rate": 0.0001, "loss": 4.1502, "loss/crossentropy": 1.741381287574768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17308862507343292, "step": 14678 }, { "epoch": 0.2936, "grad_norm": 2.03125, "grad_norm_var": 0.011633046468098958, "learning_rate": 0.0001, "loss": 4.2012, "loss/crossentropy": 2.352774977684021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20929308235645294, "step": 14680 }, { "epoch": 0.29364, "grad_norm": 2.0625, "grad_norm_var": 0.012679036458333333, "learning_rate": 0.0001, "loss": 3.9799, "loss/crossentropy": 1.6342085003852844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17740759253501892, "step": 14682 }, { "epoch": 0.29368, "grad_norm": 2.109375, "grad_norm_var": 0.0110260009765625, "learning_rate": 0.0001, "loss": 4.1968, "loss/crossentropy": 2.315647602081299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202264443039894, "step": 14684 }, { "epoch": 0.29372, "grad_norm": 2.09375, "grad_norm_var": 0.011860911051432292, "learning_rate": 0.0001, "loss": 4.0276, "loss/crossentropy": 2.1667529344558716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20557944476604462, "step": 14686 }, { "epoch": 0.29376, "grad_norm": 1.8359375, "grad_norm_var": 0.012970987955729167, "learning_rate": 0.0001, "loss": 3.7212, "loss/crossentropy": 1.9426026940345764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17865119129419327, "step": 14688 }, { "epoch": 0.2938, "grad_norm": 1.9453125, "grad_norm_var": 0.013720703125, "learning_rate": 0.0001, "loss": 3.9345, "loss/crossentropy": 2.012951970100403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918969601392746, "step": 14690 }, { "epoch": 0.29384, "grad_norm": 1.9140625, "grad_norm_var": 0.0097808837890625, "learning_rate": 0.0001, "loss": 4.2131, "loss/crossentropy": 2.2038668394088745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21520675718784332, "step": 14692 }, { "epoch": 0.29388, "grad_norm": 1.8828125, "grad_norm_var": 0.009659576416015624, "learning_rate": 0.0001, "loss": 3.9577, "loss/crossentropy": 2.284528613090515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084675058722496, "step": 14694 }, { "epoch": 0.29392, "grad_norm": 2.09375, "grad_norm_var": 0.009991200764973958, "learning_rate": 0.0001, "loss": 4.1159, "loss/crossentropy": 2.3619518280029297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23107902705669403, "step": 14696 }, { "epoch": 0.29396, "grad_norm": 2.0625, "grad_norm_var": 0.009870402018229167, "learning_rate": 0.0001, "loss": 4.1488, "loss/crossentropy": 2.279863119125366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21235084533691406, "step": 14698 }, { "epoch": 0.294, "grad_norm": 1.9921875, "grad_norm_var": 0.009639231363932292, "learning_rate": 0.0001, "loss": 4.2216, "loss/crossentropy": 2.128455936908722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21381118893623352, "step": 14700 }, { "epoch": 0.29404, "grad_norm": 1.8515625, "grad_norm_var": 0.009014638264973958, "learning_rate": 0.0001, "loss": 4.0147, "loss/crossentropy": 2.265346884727478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086036428809166, "step": 14702 }, { "epoch": 0.29408, "grad_norm": 2.0, "grad_norm_var": 0.008421834309895833, "learning_rate": 0.0001, "loss": 4.2778, "loss/crossentropy": 2.4323991537094116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21795648336410522, "step": 14704 }, { "epoch": 0.29412, "grad_norm": 2.203125, "grad_norm_var": 0.30881729125976565, "learning_rate": 0.0001, "loss": 4.3864, "loss/crossentropy": 2.2948319911956787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23302766680717468, "step": 14706 }, { "epoch": 0.29416, "grad_norm": 1.9765625, "grad_norm_var": 0.3063140869140625, "learning_rate": 0.0001, "loss": 4.121, "loss/crossentropy": 2.1273980140686035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240317091345787, "step": 14708 }, { "epoch": 0.2942, "grad_norm": 1.828125, "grad_norm_var": 0.304357655843099, "learning_rate": 0.0001, "loss": 3.7468, "loss/crossentropy": 2.0593321323394775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19727306067943573, "step": 14710 }, { "epoch": 0.29424, "grad_norm": 1.9296875, "grad_norm_var": 0.3060373942057292, "learning_rate": 0.0001, "loss": 3.9979, "loss/crossentropy": 2.339016914367676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2313404530286789, "step": 14712 }, { "epoch": 0.29428, "grad_norm": 1.9921875, "grad_norm_var": 0.30822652180989585, "learning_rate": 0.0001, "loss": 3.9406, "loss/crossentropy": 2.1587395668029785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20085373520851135, "step": 14714 }, { "epoch": 0.29432, "grad_norm": 2.078125, "grad_norm_var": 0.3103912353515625, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 1.8317759037017822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20150133967399597, "step": 14716 }, { "epoch": 0.29436, "grad_norm": 1.7890625, "grad_norm_var": 0.31324437459309895, "learning_rate": 0.0001, "loss": 3.9291, "loss/crossentropy": 1.849327266216278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17959115654230118, "step": 14718 }, { "epoch": 0.2944, "grad_norm": 1.8828125, "grad_norm_var": 0.32099507649739584, "learning_rate": 0.0001, "loss": 3.7933, "loss/crossentropy": 1.652997612953186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17243028432130814, "step": 14720 }, { "epoch": 0.29444, "grad_norm": 2.03125, "grad_norm_var": 0.012995402018229166, "learning_rate": 0.0001, "loss": 4.2919, "loss/crossentropy": 1.968604028224945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20564967393875122, "step": 14722 }, { "epoch": 0.29448, "grad_norm": 1.9921875, "grad_norm_var": 0.007466634114583333, "learning_rate": 0.0001, "loss": 4.1211, "loss/crossentropy": 2.349083185195923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24004054814577103, "step": 14724 }, { "epoch": 0.29452, "grad_norm": 1.96875, "grad_norm_var": 0.006200917561848958, "learning_rate": 0.0001, "loss": 4.2078, "loss/crossentropy": 2.1629676818847656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2260306179523468, "step": 14726 }, { "epoch": 0.29456, "grad_norm": 2.265625, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0001, "loss": 4.5995, "loss/crossentropy": 2.221126079559326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21054977923631668, "step": 14728 }, { "epoch": 0.2946, "grad_norm": 2.234375, "grad_norm_var": 0.3638987223307292, "learning_rate": 0.0001, "loss": 4.2588, "loss/crossentropy": 2.0333986282348633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27580036222934723, "step": 14730 }, { "epoch": 0.29464, "grad_norm": 2.09375, "grad_norm_var": 0.36292699178059895, "learning_rate": 0.0001, "loss": 4.2601, "loss/crossentropy": 2.0025470852851868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20166245102882385, "step": 14732 }, { "epoch": 0.29468, "grad_norm": 2.046875, "grad_norm_var": 0.35410868326822914, "learning_rate": 0.0001, "loss": 4.1044, "loss/crossentropy": 2.2882933616638184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972086951136589, "step": 14734 }, { "epoch": 0.29472, "grad_norm": 2.125, "grad_norm_var": 0.3395342508951823, "learning_rate": 0.0001, "loss": 4.1616, "loss/crossentropy": 2.2404085397720337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22015579044818878, "step": 14736 }, { "epoch": 0.29476, "grad_norm": 2.234375, "grad_norm_var": 0.33505223592122396, "learning_rate": 0.0001, "loss": 4.0132, "loss/crossentropy": 1.9309356808662415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2082076519727707, "step": 14738 }, { "epoch": 0.2948, "grad_norm": 2.046875, "grad_norm_var": 0.33227437337239585, "learning_rate": 0.0001, "loss": 4.1652, "loss/crossentropy": 2.2118934988975525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20175088196992874, "step": 14740 }, { "epoch": 0.29484, "grad_norm": 2.03125, "grad_norm_var": 0.36171875, "learning_rate": 0.0001, "loss": 4.1204, "loss/crossentropy": 2.19997900724411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22470968216657639, "step": 14742 }, { "epoch": 0.29488, "grad_norm": 2.09375, "grad_norm_var": 0.365966796875, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 1.8140272498130798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20032335817813873, "step": 14744 }, { "epoch": 0.29492, "grad_norm": 1.9609375, "grad_norm_var": 0.07648086547851562, "learning_rate": 0.0001, "loss": 3.957, "loss/crossentropy": 2.0233620405197144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20464400947093964, "step": 14746 }, { "epoch": 0.29496, "grad_norm": 2.390625, "grad_norm_var": 0.07643407185872396, "learning_rate": 0.0001, "loss": 4.3946, "loss/crossentropy": 2.3900705575942993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23763202875852585, "step": 14748 }, { "epoch": 0.295, "grad_norm": 1.9765625, "grad_norm_var": 0.07768325805664063, "learning_rate": 0.0001, "loss": 4.205, "loss/crossentropy": 1.9895538687705994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736829191446304, "step": 14750 }, { "epoch": 0.29504, "grad_norm": 2.03125, "grad_norm_var": 0.0857421875, "learning_rate": 0.0001, "loss": 4.1454, "loss/crossentropy": 1.7315555810928345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839754432439804, "step": 14752 }, { "epoch": 0.29508, "grad_norm": 2.0625, "grad_norm_var": 0.08105367024739583, "learning_rate": 0.0001, "loss": 4.5482, "loss/crossentropy": 2.44241464138031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505839467048645, "step": 14754 }, { "epoch": 0.29512, "grad_norm": 2.1875, "grad_norm_var": 0.0849029541015625, "learning_rate": 0.0001, "loss": 3.782, "loss/crossentropy": 1.7948896884918213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987960934638977, "step": 14756 }, { "epoch": 0.29516, "grad_norm": 2.125, "grad_norm_var": 0.020334625244140626, "learning_rate": 0.0001, "loss": 4.0898, "loss/crossentropy": 1.710760235786438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18624412268400192, "step": 14758 }, { "epoch": 0.2952, "grad_norm": 2.390625, "grad_norm_var": 0.027205149332682293, "learning_rate": 0.0001, "loss": 4.4651, "loss/crossentropy": 2.0127750635147095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20178033411502838, "step": 14760 }, { "epoch": 0.29524, "grad_norm": 2.0, "grad_norm_var": 0.027042388916015625, "learning_rate": 0.0001, "loss": 4.1152, "loss/crossentropy": 1.9764790534973145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19000716507434845, "step": 14762 }, { "epoch": 0.29528, "grad_norm": 2.140625, "grad_norm_var": 0.020930735270182292, "learning_rate": 0.0001, "loss": 3.9634, "loss/crossentropy": 2.0221771597862244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980113685131073, "step": 14764 }, { "epoch": 0.29532, "grad_norm": 2.09375, "grad_norm_var": 0.019972483317057293, "learning_rate": 0.0001, "loss": 4.1721, "loss/crossentropy": 2.278168559074402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231141597032547, "step": 14766 }, { "epoch": 0.29536, "grad_norm": 1.953125, "grad_norm_var": 0.0168853759765625, "learning_rate": 0.0001, "loss": 4.1308, "loss/crossentropy": 2.1837246417999268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026391476392746, "step": 14768 }, { "epoch": 0.2954, "grad_norm": 1.890625, "grad_norm_var": 0.019795735677083332, "learning_rate": 0.0001, "loss": 4.0694, "loss/crossentropy": 2.1170668601989746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1871805638074875, "step": 14770 }, { "epoch": 0.29544, "grad_norm": 2.125, "grad_norm_var": 0.01784032185872396, "learning_rate": 0.0001, "loss": 4.1003, "loss/crossentropy": 1.9905366897583008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951729580760002, "step": 14772 }, { "epoch": 0.29548, "grad_norm": 2.03125, "grad_norm_var": 0.017146809895833334, "learning_rate": 0.0001, "loss": 3.9811, "loss/crossentropy": 1.781678318977356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19464702904224396, "step": 14774 }, { "epoch": 0.29552, "grad_norm": 1.875, "grad_norm_var": 0.012303670247395834, "learning_rate": 0.0001, "loss": 4.1944, "loss/crossentropy": 2.2671462297439575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22269655019044876, "step": 14776 }, { "epoch": 0.29556, "grad_norm": 2.046875, "grad_norm_var": 0.012511952718098959, "learning_rate": 0.0001, "loss": 4.2128, "loss/crossentropy": 2.0294137001037598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21939906477928162, "step": 14778 }, { "epoch": 0.2956, "grad_norm": 1.9375, "grad_norm_var": 0.011264801025390625, "learning_rate": 0.0001, "loss": 4.1295, "loss/crossentropy": 1.9174134731292725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19592813402414322, "step": 14780 }, { "epoch": 0.29564, "grad_norm": 2.0, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 3.9877, "loss/crossentropy": 2.0033875703811646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121451273560524, "step": 14782 }, { "epoch": 0.29568, "grad_norm": 1.9765625, "grad_norm_var": 0.09626439412434896, "learning_rate": 0.0001, "loss": 4.0263, "loss/crossentropy": 2.209542691707611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18373841792345047, "step": 14784 }, { "epoch": 0.29572, "grad_norm": 2.015625, "grad_norm_var": 0.09244155883789062, "learning_rate": 0.0001, "loss": 4.149, "loss/crossentropy": 2.0651984214782715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20486845821142197, "step": 14786 }, { "epoch": 0.29576, "grad_norm": 2.03125, "grad_norm_var": 0.09123433430989583, "learning_rate": 0.0001, "loss": 4.1697, "loss/crossentropy": 2.311215043067932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22553718090057373, "step": 14788 }, { "epoch": 0.2958, "grad_norm": 2.234375, "grad_norm_var": 0.09251708984375, "learning_rate": 0.0001, "loss": 4.3291, "loss/crossentropy": 2.0282764434814453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20748327672481537, "step": 14790 }, { "epoch": 0.29584, "grad_norm": 1.9765625, "grad_norm_var": 0.09575093587239583, "learning_rate": 0.0001, "loss": 3.764, "loss/crossentropy": 2.04198157787323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19687633216381073, "step": 14792 }, { "epoch": 0.29588, "grad_norm": 2.125, "grad_norm_var": 0.09601236979166666, "learning_rate": 0.0001, "loss": 4.389, "loss/crossentropy": 2.049328565597534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20265275985002518, "step": 14794 }, { "epoch": 0.29592, "grad_norm": 1.984375, "grad_norm_var": 0.0992876688639323, "learning_rate": 0.0001, "loss": 3.9118, "loss/crossentropy": 1.905173659324646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193840891122818, "step": 14796 }, { "epoch": 0.29596, "grad_norm": 2.078125, "grad_norm_var": 0.09698486328125, "learning_rate": 0.0001, "loss": 3.941, "loss/crossentropy": 1.8715303540229797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18632445484399796, "step": 14798 }, { "epoch": 0.296, "grad_norm": 1.953125, "grad_norm_var": 0.013765207926432292, "learning_rate": 0.0001, "loss": 4.039, "loss/crossentropy": 2.0428889989852905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19538581371307373, "step": 14800 }, { "epoch": 0.29604, "grad_norm": 2.203125, "grad_norm_var": 0.015553538004557292, "learning_rate": 0.0001, "loss": 4.0834, "loss/crossentropy": 2.2959046363830566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2372349202632904, "step": 14802 }, { "epoch": 0.29608, "grad_norm": 1.96875, "grad_norm_var": 0.016190592447916666, "learning_rate": 0.0001, "loss": 4.0399, "loss/crossentropy": 2.083451807498932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20915375649929047, "step": 14804 }, { "epoch": 0.29612, "grad_norm": 2.03125, "grad_norm_var": 0.0132232666015625, "learning_rate": 0.0001, "loss": 4.293, "loss/crossentropy": 2.31546950340271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20648062229156494, "step": 14806 }, { "epoch": 0.29616, "grad_norm": 1.96875, "grad_norm_var": 0.013019816080729166, "learning_rate": 0.0001, "loss": 3.7273, "loss/crossentropy": 1.9833638072013855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19132772833108902, "step": 14808 }, { "epoch": 0.2962, "grad_norm": 1.8984375, "grad_norm_var": 0.013655344645182291, "learning_rate": 0.0001, "loss": 3.8247, "loss/crossentropy": 1.784917414188385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19032159447669983, "step": 14810 }, { "epoch": 0.29624, "grad_norm": 1.9921875, "grad_norm_var": 0.011263020833333333, "learning_rate": 0.0001, "loss": 4.0011, "loss/crossentropy": 1.843966782093048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22127597779035568, "step": 14812 }, { "epoch": 0.29628, "grad_norm": 2.921875, "grad_norm_var": 0.06729100545247396, "learning_rate": 0.0001, "loss": 4.0535, "loss/crossentropy": 1.8107360005378723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18275444209575653, "step": 14814 }, { "epoch": 0.29632, "grad_norm": 2.078125, "grad_norm_var": 0.06665827433268229, "learning_rate": 0.0001, "loss": 4.2711, "loss/crossentropy": 2.147618055343628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23800157755613327, "step": 14816 }, { "epoch": 0.29636, "grad_norm": 2.0625, "grad_norm_var": 0.06475804646809896, "learning_rate": 0.0001, "loss": 4.267, "loss/crossentropy": 2.4430278539657593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2401380091905594, "step": 14818 }, { "epoch": 0.2964, "grad_norm": 2.078125, "grad_norm_var": 0.06377665201822917, "learning_rate": 0.0001, "loss": 4.3204, "loss/crossentropy": 2.181501626968384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22491848468780518, "step": 14820 }, { "epoch": 0.29644, "grad_norm": 2.046875, "grad_norm_var": 0.06398111979166667, "learning_rate": 0.0001, "loss": 4.0375, "loss/crossentropy": 2.101313889026642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21598663926124573, "step": 14822 }, { "epoch": 0.29648, "grad_norm": 1.9765625, "grad_norm_var": 0.059020741780598955, "learning_rate": 0.0001, "loss": 3.9629, "loss/crossentropy": 1.6834549307823181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19430063664913177, "step": 14824 }, { "epoch": 0.29652, "grad_norm": 1.9921875, "grad_norm_var": 0.0566162109375, "learning_rate": 0.0001, "loss": 4.0087, "loss/crossentropy": 1.8400229215621948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18764454871416092, "step": 14826 }, { "epoch": 0.29656, "grad_norm": 2.015625, "grad_norm_var": 0.05609944661458333, "learning_rate": 0.0001, "loss": 4.1427, "loss/crossentropy": 1.9417667388916016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21304062008857727, "step": 14828 }, { "epoch": 0.2966, "grad_norm": 2.046875, "grad_norm_var": 0.003639475504557292, "learning_rate": 0.0001, "loss": 4.125, "loss/crossentropy": 2.227192521095276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23011230677366257, "step": 14830 }, { "epoch": 0.29664, "grad_norm": 2.015625, "grad_norm_var": 0.0029436747233072915, "learning_rate": 0.0001, "loss": 4.3496, "loss/crossentropy": 2.2136365175247192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21694285422563553, "step": 14832 }, { "epoch": 0.29668, "grad_norm": 1.890625, "grad_norm_var": 0.004137929280598958, "learning_rate": 0.0001, "loss": 3.9464, "loss/crossentropy": 1.9179469347000122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996636986732483, "step": 14834 }, { "epoch": 0.29672, "grad_norm": 1.9921875, "grad_norm_var": 0.0043718973795572914, "learning_rate": 0.0001, "loss": 3.9001, "loss/crossentropy": 1.9639176726341248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17738713324069977, "step": 14836 }, { "epoch": 0.29676, "grad_norm": 2.0625, "grad_norm_var": 0.004564412434895833, "learning_rate": 0.0001, "loss": 4.2665, "loss/crossentropy": 1.7748695611953735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18663641810417175, "step": 14838 }, { "epoch": 0.2968, "grad_norm": 1.8046875, "grad_norm_var": 0.007054646809895833, "learning_rate": 0.0001, "loss": 3.9879, "loss/crossentropy": 2.094521999359131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18323734402656555, "step": 14840 }, { "epoch": 0.29684, "grad_norm": 2.3125, "grad_norm_var": 0.01407470703125, "learning_rate": 0.0001, "loss": 4.5969, "loss/crossentropy": 2.488257050514221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23173442482948303, "step": 14842 }, { "epoch": 0.29688, "grad_norm": 2.046875, "grad_norm_var": 0.013444010416666667, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 2.2317086458206177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2046324908733368, "step": 14844 }, { "epoch": 0.29692, "grad_norm": 2.09375, "grad_norm_var": 0.014134724934895834, "learning_rate": 0.0001, "loss": 3.9917, "loss/crossentropy": 2.275767207145691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20739831775426865, "step": 14846 }, { "epoch": 0.29696, "grad_norm": 1.9765625, "grad_norm_var": 0.014501698811848958, "learning_rate": 0.0001, "loss": 3.9728, "loss/crossentropy": 2.0288257002830505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18860980868339539, "step": 14848 }, { "epoch": 0.297, "grad_norm": 2.140625, "grad_norm_var": 0.015449778238932291, "learning_rate": 0.0001, "loss": 4.3509, "loss/crossentropy": 2.0609896183013916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20585624873638153, "step": 14850 }, { "epoch": 0.29704, "grad_norm": 1.96875, "grad_norm_var": 0.018214670817057292, "learning_rate": 0.0001, "loss": 3.5485, "loss/crossentropy": 1.9250158667564392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1843906193971634, "step": 14852 }, { "epoch": 0.29708, "grad_norm": 1.96875, "grad_norm_var": 0.018070475260416666, "learning_rate": 0.0001, "loss": 3.9761, "loss/crossentropy": 2.118358612060547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2035902589559555, "step": 14854 }, { "epoch": 0.29712, "grad_norm": 1.90625, "grad_norm_var": 0.015610504150390624, "learning_rate": 0.0001, "loss": 4.135, "loss/crossentropy": 1.8381852507591248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1859317123889923, "step": 14856 }, { "epoch": 0.29716, "grad_norm": 1.9140625, "grad_norm_var": 0.009513346354166667, "learning_rate": 0.0001, "loss": 4.1235, "loss/crossentropy": 2.078063726425171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21035100519657135, "step": 14858 }, { "epoch": 0.2972, "grad_norm": 2.078125, "grad_norm_var": 0.010701497395833334, "learning_rate": 0.0001, "loss": 4.0549, "loss/crossentropy": 2.1546566486358643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19998939335346222, "step": 14860 }, { "epoch": 0.29724, "grad_norm": 2.046875, "grad_norm_var": 0.010164133707682292, "learning_rate": 0.0001, "loss": 4.2264, "loss/crossentropy": 2.019156754016876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1957200989127159, "step": 14862 }, { "epoch": 0.29728, "grad_norm": 2.140625, "grad_norm_var": 0.013138834635416667, "learning_rate": 0.0001, "loss": 4.0808, "loss/crossentropy": 1.8542814254760742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19331540167331696, "step": 14864 }, { "epoch": 0.29732, "grad_norm": 1.9453125, "grad_norm_var": 0.010895792643229167, "learning_rate": 0.0001, "loss": 4.2488, "loss/crossentropy": 2.241651773452759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22117780148983002, "step": 14866 }, { "epoch": 0.29736, "grad_norm": 1.9609375, "grad_norm_var": 0.011226145426432292, "learning_rate": 0.0001, "loss": 4.2139, "loss/crossentropy": 1.8856277465820312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19262754172086716, "step": 14868 }, { "epoch": 0.2974, "grad_norm": 2.203125, "grad_norm_var": 0.013388824462890626, "learning_rate": 0.0001, "loss": 4.3315, "loss/crossentropy": 2.0159581899642944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21892660111188889, "step": 14870 }, { "epoch": 0.29744, "grad_norm": 1.828125, "grad_norm_var": 0.014562733968098958, "learning_rate": 0.0001, "loss": 3.8965, "loss/crossentropy": 1.8794240355491638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18827088177204132, "step": 14872 }, { "epoch": 0.29748, "grad_norm": 1.9921875, "grad_norm_var": 0.013726552327473959, "learning_rate": 0.0001, "loss": 4.2176, "loss/crossentropy": 2.3741602897644043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2292410209774971, "step": 14874 }, { "epoch": 0.29752, "grad_norm": 1.828125, "grad_norm_var": 0.014025624593098958, "learning_rate": 0.0001, "loss": 3.8625, "loss/crossentropy": 1.9994327425956726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21266867220401764, "step": 14876 }, { "epoch": 0.29756, "grad_norm": 2.125, "grad_norm_var": 0.014143880208333333, "learning_rate": 0.0001, "loss": 4.1337, "loss/crossentropy": 2.0914021730422974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071390450000763, "step": 14878 }, { "epoch": 0.2976, "grad_norm": 1.8828125, "grad_norm_var": 0.012798817952473958, "learning_rate": 0.0001, "loss": 4.0463, "loss/crossentropy": 1.8055935502052307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19375848025083542, "step": 14880 }, { "epoch": 0.29764, "grad_norm": 2.03125, "grad_norm_var": 0.012896474202473958, "learning_rate": 0.0001, "loss": 4.3662, "loss/crossentropy": 2.614544630050659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22156962752342224, "step": 14882 }, { "epoch": 0.29768, "grad_norm": 1.8515625, "grad_norm_var": 0.013205718994140626, "learning_rate": 0.0001, "loss": 3.6502, "loss/crossentropy": 1.7306513786315918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17159338295459747, "step": 14884 }, { "epoch": 0.29772, "grad_norm": 2.046875, "grad_norm_var": 0.010060373942057292, "learning_rate": 0.0001, "loss": 4.0683, "loss/crossentropy": 2.060949981212616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20615212619304657, "step": 14886 }, { "epoch": 0.29776, "grad_norm": 1.8984375, "grad_norm_var": 0.008756510416666667, "learning_rate": 0.0001, "loss": 3.9903, "loss/crossentropy": 1.8338764309883118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274019241333008, "step": 14888 }, { "epoch": 0.2978, "grad_norm": 1.78125, "grad_norm_var": 0.013683827718098958, "learning_rate": 0.0001, "loss": 4.0335, "loss/crossentropy": 2.2182517051696777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22446341067552567, "step": 14890 }, { "epoch": 0.29784, "grad_norm": 2.09375, "grad_norm_var": 0.012951405843098958, "learning_rate": 0.0001, "loss": 4.3924, "loss/crossentropy": 2.222801446914673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22065874934196472, "step": 14892 }, { "epoch": 0.29788, "grad_norm": 1.8515625, "grad_norm_var": 0.012808990478515626, "learning_rate": 0.0001, "loss": 4.1315, "loss/crossentropy": 2.2181414365768433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21953265368938446, "step": 14894 }, { "epoch": 0.29792, "grad_norm": 1.984375, "grad_norm_var": 0.013166300455729167, "learning_rate": 0.0001, "loss": 3.654, "loss/crossentropy": 1.9304096102714539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20096193999052048, "step": 14896 }, { "epoch": 0.29796, "grad_norm": 2.0, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 4.2074, "loss/crossentropy": 2.2288341522216797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24108773469924927, "step": 14898 }, { "epoch": 0.298, "grad_norm": 2.15625, "grad_norm_var": 0.012550608317057291, "learning_rate": 0.0001, "loss": 4.191, "loss/crossentropy": 2.1711790561676025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2395533323287964, "step": 14900 }, { "epoch": 0.29804, "grad_norm": 2.03125, "grad_norm_var": 0.019482167561848958, "learning_rate": 0.0001, "loss": 4.2686, "loss/crossentropy": 2.0533857345581055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21801679581403732, "step": 14902 }, { "epoch": 0.29808, "grad_norm": 2.0625, "grad_norm_var": 0.01883519490559896, "learning_rate": 0.0001, "loss": 3.8678, "loss/crossentropy": 1.634634256362915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17861492931842804, "step": 14904 }, { "epoch": 0.29812, "grad_norm": 2.15625, "grad_norm_var": 0.0149322509765625, "learning_rate": 0.0001, "loss": 4.2152, "loss/crossentropy": 2.3151358366012573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22694342583417892, "step": 14906 }, { "epoch": 0.29816, "grad_norm": 2.03125, "grad_norm_var": 0.01585871378580729, "learning_rate": 0.0001, "loss": 4.085, "loss/crossentropy": 1.9165552258491516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18231045454740524, "step": 14908 }, { "epoch": 0.2982, "grad_norm": 1.8828125, "grad_norm_var": 0.01480712890625, "learning_rate": 0.0001, "loss": 3.9693, "loss/crossentropy": 2.0094637274742126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19547566026449203, "step": 14910 }, { "epoch": 0.29824, "grad_norm": 1.8515625, "grad_norm_var": 0.014753214518229167, "learning_rate": 0.0001, "loss": 3.9091, "loss/crossentropy": 1.8764418959617615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18009892851114273, "step": 14912 }, { "epoch": 0.29828, "grad_norm": 2.140625, "grad_norm_var": 0.015868123372395834, "learning_rate": 0.0001, "loss": 4.429, "loss/crossentropy": 2.1255921721458435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20359216630458832, "step": 14914 }, { "epoch": 0.29832, "grad_norm": 1.9921875, "grad_norm_var": 0.014891560872395833, "learning_rate": 0.0001, "loss": 3.7889, "loss/crossentropy": 2.1316330432891846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970125362277031, "step": 14916 }, { "epoch": 0.29836, "grad_norm": 1.9453125, "grad_norm_var": 0.0089996337890625, "learning_rate": 0.0001, "loss": 4.0305, "loss/crossentropy": 1.7825700640678406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1902567446231842, "step": 14918 }, { "epoch": 0.2984, "grad_norm": 2.0, "grad_norm_var": 0.008565012613932292, "learning_rate": 0.0001, "loss": 3.9577, "loss/crossentropy": 1.982073962688446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20216460525989532, "step": 14920 }, { "epoch": 0.29844, "grad_norm": 1.953125, "grad_norm_var": 0.0060943603515625, "learning_rate": 0.0001, "loss": 4.04, "loss/crossentropy": 2.3210418224334717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2224876508116722, "step": 14922 }, { "epoch": 0.29848, "grad_norm": 2.03125, "grad_norm_var": 0.005535634358723959, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 1.7990338206291199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17401983588933945, "step": 14924 }, { "epoch": 0.29852, "grad_norm": 2.1875, "grad_norm_var": 0.010545857747395833, "learning_rate": 0.0001, "loss": 4.4979, "loss/crossentropy": 1.8772737979888916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21376933157444, "step": 14926 }, { "epoch": 0.29856, "grad_norm": 1.8984375, "grad_norm_var": 0.009716542561848958, "learning_rate": 0.0001, "loss": 4.0544, "loss/crossentropy": 1.6914128065109253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17860179394483566, "step": 14928 }, { "epoch": 0.2986, "grad_norm": 1.9609375, "grad_norm_var": 0.009691365559895833, "learning_rate": 0.0001, "loss": 3.9737, "loss/crossentropy": 1.9545430541038513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015315517783165, "step": 14930 }, { "epoch": 0.29864, "grad_norm": 1.921875, "grad_norm_var": 0.009175618489583334, "learning_rate": 0.0001, "loss": 3.9181, "loss/crossentropy": 2.0585074424743652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029297649860382, "step": 14932 }, { "epoch": 0.29868, "grad_norm": 1.8515625, "grad_norm_var": 0.009679921468098958, "learning_rate": 0.0001, "loss": 3.8638, "loss/crossentropy": 1.9196518063545227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970217153429985, "step": 14934 }, { "epoch": 0.29872, "grad_norm": 1.9453125, "grad_norm_var": 0.0097564697265625, "learning_rate": 0.0001, "loss": 4.1366, "loss/crossentropy": 2.1077409982681274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22366851568222046, "step": 14936 }, { "epoch": 0.29876, "grad_norm": 3.921875, "grad_norm_var": 0.24059829711914063, "learning_rate": 0.0001, "loss": 3.8552, "loss/crossentropy": 1.7660444974899292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23386957496404648, "step": 14938 }, { "epoch": 0.2988, "grad_norm": 2.21875, "grad_norm_var": 0.23871841430664062, "learning_rate": 0.0001, "loss": 4.2936, "loss/crossentropy": 1.8436493873596191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19589588046073914, "step": 14940 }, { "epoch": 0.29884, "grad_norm": 2.40625, "grad_norm_var": 0.24443333943684895, "learning_rate": 0.0001, "loss": 4.3465, "loss/crossentropy": 2.2123888731002808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26607823371887207, "step": 14942 }, { "epoch": 0.29888, "grad_norm": 2.5625, "grad_norm_var": 0.2546119689941406, "learning_rate": 0.0001, "loss": 4.3234, "loss/crossentropy": 2.70485258102417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2529008686542511, "step": 14944 }, { "epoch": 0.29892, "grad_norm": 2.109375, "grad_norm_var": 0.25155843098958336, "learning_rate": 0.0001, "loss": 4.1379, "loss/crossentropy": 1.932865023612976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22141847014427185, "step": 14946 }, { "epoch": 0.29896, "grad_norm": 1.9765625, "grad_norm_var": 0.25643310546875, "learning_rate": 0.0001, "loss": 4.0119, "loss/crossentropy": 2.213751196861267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21382954716682434, "step": 14948 }, { "epoch": 0.299, "grad_norm": 1.9609375, "grad_norm_var": 0.251073964436849, "learning_rate": 0.0001, "loss": 3.9673, "loss/crossentropy": 1.679059088230133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16484958678483963, "step": 14950 }, { "epoch": 0.29904, "grad_norm": 1.9453125, "grad_norm_var": 0.24972508748372396, "learning_rate": 0.0001, "loss": 3.9873, "loss/crossentropy": 1.748594582080841, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20150119066238403, "step": 14952 }, { "epoch": 0.29908, "grad_norm": 2.5625, "grad_norm_var": 0.05215657552083333, "learning_rate": 0.0001, "loss": 4.8053, "loss/crossentropy": 2.4588215351104736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2513396218419075, "step": 14954 }, { "epoch": 0.29912, "grad_norm": 1.9453125, "grad_norm_var": 0.05468114217122396, "learning_rate": 0.0001, "loss": 4.0782, "loss/crossentropy": 2.1104516983032227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20738063752651215, "step": 14956 }, { "epoch": 0.29916, "grad_norm": 2.109375, "grad_norm_var": 0.04905776977539063, "learning_rate": 0.0001, "loss": 4.2351, "loss/crossentropy": 1.7171857953071594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27423302084207535, "step": 14958 }, { "epoch": 0.2992, "grad_norm": 1.890625, "grad_norm_var": 0.03264058430989583, "learning_rate": 0.0001, "loss": 3.7507, "loss/crossentropy": 1.5607159733772278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1616373285651207, "step": 14960 }, { "epoch": 0.29924, "grad_norm": 1.9765625, "grad_norm_var": 0.033394114176432295, "learning_rate": 0.0001, "loss": 4.0128, "loss/crossentropy": 1.9257365465164185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18031761050224304, "step": 14962 }, { "epoch": 0.29928, "grad_norm": 2.0, "grad_norm_var": 0.031062825520833334, "learning_rate": 0.0001, "loss": 3.9406, "loss/crossentropy": 1.876187801361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19531002640724182, "step": 14964 }, { "epoch": 0.29932, "grad_norm": 2.140625, "grad_norm_var": 0.0317047119140625, "learning_rate": 0.0001, "loss": 4.2028, "loss/crossentropy": 1.9325042963027954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19496598094701767, "step": 14966 }, { "epoch": 0.29936, "grad_norm": 1.8515625, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 4.1293, "loss/crossentropy": 2.148952007293701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18833627551794052, "step": 14968 }, { "epoch": 0.2994, "grad_norm": 2.015625, "grad_norm_var": 0.008099110921223958, "learning_rate": 0.0001, "loss": 4.0954, "loss/crossentropy": 2.3639365434646606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21851496398448944, "step": 14970 }, { "epoch": 0.29944, "grad_norm": 1.8984375, "grad_norm_var": 0.008371734619140625, "learning_rate": 0.0001, "loss": 3.8026, "loss/crossentropy": 1.941165030002594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19548548012971878, "step": 14972 }, { "epoch": 0.29948, "grad_norm": 1.9453125, "grad_norm_var": 0.0082916259765625, "learning_rate": 0.0001, "loss": 4.1203, "loss/crossentropy": 2.030466139316559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1922580599784851, "step": 14974 }, { "epoch": 0.29952, "grad_norm": 2.0625, "grad_norm_var": 0.008487955729166666, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.2279865741729736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20831073075532913, "step": 14976 }, { "epoch": 0.29956, "grad_norm": 1.875, "grad_norm_var": 0.007047271728515625, "learning_rate": 0.0001, "loss": 4.1708, "loss/crossentropy": 2.0749244689941406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18032152205705643, "step": 14978 }, { "epoch": 0.2996, "grad_norm": 2.0625, "grad_norm_var": 0.04270731608072917, "learning_rate": 0.0001, "loss": 4.199, "loss/crossentropy": 2.3190104961395264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24735668301582336, "step": 14980 }, { "epoch": 0.29964, "grad_norm": 1.8203125, "grad_norm_var": 0.044406890869140625, "learning_rate": 0.0001, "loss": 3.9549, "loss/crossentropy": 1.970013439655304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19323202222585678, "step": 14982 }, { "epoch": 0.29968, "grad_norm": 1.9921875, "grad_norm_var": 0.045169830322265625, "learning_rate": 0.0001, "loss": 4.4762, "loss/crossentropy": 2.052343726158142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20909954607486725, "step": 14984 }, { "epoch": 0.29972, "grad_norm": 2.046875, "grad_norm_var": 0.043268839518229164, "learning_rate": 0.0001, "loss": 4.1776, "loss/crossentropy": 2.2586673498153687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22640614211559296, "step": 14986 }, { "epoch": 0.29976, "grad_norm": 1.9609375, "grad_norm_var": 0.042557779947916666, "learning_rate": 0.0001, "loss": 3.8465, "loss/crossentropy": 1.8843002319335938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19117384403944016, "step": 14988 }, { "epoch": 0.2998, "grad_norm": 1.9453125, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 3.7343, "loss/crossentropy": 1.542019009590149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18295453488826752, "step": 14990 }, { "epoch": 0.29984, "grad_norm": 1.9609375, "grad_norm_var": 0.043369293212890625, "learning_rate": 0.0001, "loss": 4.1335, "loss/crossentropy": 2.160573959350586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22916415333747864, "step": 14992 }, { "epoch": 0.29988, "grad_norm": 2.609375, "grad_norm_var": 0.062168121337890625, "learning_rate": 0.0001, "loss": 4.1898, "loss/crossentropy": 2.1296870708465576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20498204976320267, "step": 14994 }, { "epoch": 0.29992, "grad_norm": 1.8671875, "grad_norm_var": 0.0336669921875, "learning_rate": 0.0001, "loss": 4.0554, "loss/crossentropy": 2.0356279015541077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21798957884311676, "step": 14996 }, { "epoch": 0.29996, "grad_norm": 2.015625, "grad_norm_var": 0.035796864827473955, "learning_rate": 0.0001, "loss": 4.6195, "loss/crossentropy": 2.466023027896881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22107623517513275, "step": 14998 }, { "epoch": 0.3, "grad_norm": 2.046875, "grad_norm_var": 0.03408203125, "learning_rate": 0.0001, "loss": 4.1455, "loss/crossentropy": 1.8457531332969666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21306351572275162, "step": 15000 }, { "epoch": 0.30004, "grad_norm": 2.078125, "grad_norm_var": 0.034211222330729166, "learning_rate": 0.0001, "loss": 4.5128, "loss/crossentropy": 2.202435612678528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21542339771986008, "step": 15002 }, { "epoch": 0.30008, "grad_norm": 1.9921875, "grad_norm_var": 0.032613118489583336, "learning_rate": 0.0001, "loss": 4.1489, "loss/crossentropy": 2.142255425453186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21016352623701096, "step": 15004 }, { "epoch": 0.30012, "grad_norm": 2.03125, "grad_norm_var": 0.030631256103515626, "learning_rate": 0.0001, "loss": 4.1443, "loss/crossentropy": 1.949910044670105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19256845861673355, "step": 15006 }, { "epoch": 0.30016, "grad_norm": 2.078125, "grad_norm_var": 0.029750315348307292, "learning_rate": 0.0001, "loss": 4.34, "loss/crossentropy": 2.257867217063904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22471557557582855, "step": 15008 }, { "epoch": 0.3002, "grad_norm": 2.140625, "grad_norm_var": 0.011358388264973958, "learning_rate": 0.0001, "loss": 4.3457, "loss/crossentropy": 2.1541898250579834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23054643720388412, "step": 15010 }, { "epoch": 0.30024, "grad_norm": 2.296875, "grad_norm_var": 0.01165771484375, "learning_rate": 0.0001, "loss": 4.2537, "loss/crossentropy": 2.0029123425483704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21815954893827438, "step": 15012 }, { "epoch": 0.30028, "grad_norm": 2.125, "grad_norm_var": 0.0076416015625, "learning_rate": 0.0001, "loss": 4.0093, "loss/crossentropy": 2.0617172718048096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19382908195257187, "step": 15014 }, { "epoch": 0.30032, "grad_norm": 1.953125, "grad_norm_var": 0.00921630859375, "learning_rate": 0.0001, "loss": 4.3123, "loss/crossentropy": 2.1493303775787354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21762489527463913, "step": 15016 }, { "epoch": 0.30036, "grad_norm": 2.0625, "grad_norm_var": 0.0093505859375, "learning_rate": 0.0001, "loss": 4.1282, "loss/crossentropy": 2.081854462623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22617685049772263, "step": 15018 }, { "epoch": 0.3004, "grad_norm": 1.953125, "grad_norm_var": 0.009842681884765624, "learning_rate": 0.0001, "loss": 4.0352, "loss/crossentropy": 1.8565402626991272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18223749846220016, "step": 15020 }, { "epoch": 0.30044, "grad_norm": 1.96875, "grad_norm_var": 0.010815175374348958, "learning_rate": 0.0001, "loss": 4.1779, "loss/crossentropy": 2.129835605621338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21263620257377625, "step": 15022 }, { "epoch": 0.30048, "grad_norm": 2.15625, "grad_norm_var": 0.011189524332682292, "learning_rate": 0.0001, "loss": 4.5131, "loss/crossentropy": 2.113425612449646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22930295765399933, "step": 15024 }, { "epoch": 0.30052, "grad_norm": 2.0, "grad_norm_var": 0.010109202067057291, "learning_rate": 0.0001, "loss": 4.1557, "loss/crossentropy": 2.254178762435913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21644078195095062, "step": 15026 }, { "epoch": 0.30056, "grad_norm": 1.9765625, "grad_norm_var": 0.011966959635416666, "learning_rate": 0.0001, "loss": 4.0863, "loss/crossentropy": 2.233784854412079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22132302820682526, "step": 15028 }, { "epoch": 0.3006, "grad_norm": 1.953125, "grad_norm_var": 0.0118804931640625, "learning_rate": 0.0001, "loss": 3.8657, "loss/crossentropy": 2.2234549522399902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144894078373909, "step": 15030 }, { "epoch": 0.30064, "grad_norm": 2.046875, "grad_norm_var": 0.0108306884765625, "learning_rate": 0.0001, "loss": 4.1315, "loss/crossentropy": 2.0401915907859802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20804419368505478, "step": 15032 }, { "epoch": 0.30068, "grad_norm": 1.953125, "grad_norm_var": 0.010978190104166667, "learning_rate": 0.0001, "loss": 4.0939, "loss/crossentropy": 2.058899462223053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2222386747598648, "step": 15034 }, { "epoch": 0.30072, "grad_norm": 2.421875, "grad_norm_var": 0.020694986979166666, "learning_rate": 0.0001, "loss": 4.5452, "loss/crossentropy": 2.396425485610962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24734684079885483, "step": 15036 }, { "epoch": 0.30076, "grad_norm": 2.03125, "grad_norm_var": 0.02011693318684896, "learning_rate": 0.0001, "loss": 4.1124, "loss/crossentropy": 1.9825797080993652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20504453778266907, "step": 15038 }, { "epoch": 0.3008, "grad_norm": 2.015625, "grad_norm_var": 0.02068049112955729, "learning_rate": 0.0001, "loss": 4.0287, "loss/crossentropy": 1.9550088047981262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051691859960556, "step": 15040 }, { "epoch": 0.30084, "grad_norm": 2.015625, "grad_norm_var": 0.02102839152018229, "learning_rate": 0.0001, "loss": 4.1926, "loss/crossentropy": 2.166532874107361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20486898720264435, "step": 15042 }, { "epoch": 0.30088, "grad_norm": 1.921875, "grad_norm_var": 0.0158355712890625, "learning_rate": 0.0001, "loss": 3.9905, "loss/crossentropy": 1.8752986192703247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20286446809768677, "step": 15044 }, { "epoch": 0.30092, "grad_norm": 2.125, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 4.3217, "loss/crossentropy": 1.9599428176879883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061413824558258, "step": 15046 }, { "epoch": 0.30096, "grad_norm": 2.015625, "grad_norm_var": 0.015428670247395833, "learning_rate": 0.0001, "loss": 4.2914, "loss/crossentropy": 2.5601563453674316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123739868402481, "step": 15048 }, { "epoch": 0.301, "grad_norm": 2.0, "grad_norm_var": 0.015062459309895833, "learning_rate": 0.0001, "loss": 4.2784, "loss/crossentropy": 2.075629949569702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20791151374578476, "step": 15050 }, { "epoch": 0.30104, "grad_norm": 2.171875, "grad_norm_var": 0.008532460530598958, "learning_rate": 0.0001, "loss": 4.1578, "loss/crossentropy": 1.8346505165100098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16620153188705444, "step": 15052 }, { "epoch": 0.30108, "grad_norm": 2.109375, "grad_norm_var": 0.009382120768229167, "learning_rate": 0.0001, "loss": 4.003, "loss/crossentropy": 1.9715532660484314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20792889595031738, "step": 15054 }, { "epoch": 0.30112, "grad_norm": 1.9296875, "grad_norm_var": 0.0090728759765625, "learning_rate": 0.0001, "loss": 4.06, "loss/crossentropy": 2.135376811027527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.218796044588089, "step": 15056 }, { "epoch": 0.30116, "grad_norm": 2.171875, "grad_norm_var": 0.00941162109375, "learning_rate": 0.0001, "loss": 4.4339, "loss/crossentropy": 2.095982074737549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2120692878961563, "step": 15058 }, { "epoch": 0.3012, "grad_norm": 2.203125, "grad_norm_var": 0.00911865234375, "learning_rate": 0.0001, "loss": 4.3395, "loss/crossentropy": 2.2836159467697144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23748520761728287, "step": 15060 }, { "epoch": 0.30124, "grad_norm": 2.03125, "grad_norm_var": 0.010163370768229167, "learning_rate": 0.0001, "loss": 3.8877, "loss/crossentropy": 2.073283314704895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19927848130464554, "step": 15062 }, { "epoch": 0.30128, "grad_norm": 2.015625, "grad_norm_var": 0.010196940104166666, "learning_rate": 0.0001, "loss": 4.1908, "loss/crossentropy": 2.2401102781295776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22298122942447662, "step": 15064 }, { "epoch": 0.30132, "grad_norm": 1.9140625, "grad_norm_var": 0.012650299072265624, "learning_rate": 0.0001, "loss": 4.1878, "loss/crossentropy": 2.063979387283325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19714537262916565, "step": 15066 }, { "epoch": 0.30136, "grad_norm": 2.0, "grad_norm_var": 0.012894694010416667, "learning_rate": 0.0001, "loss": 3.6407, "loss/crossentropy": 2.0346400141716003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18405266851186752, "step": 15068 }, { "epoch": 0.3014, "grad_norm": 1.8671875, "grad_norm_var": 0.013437652587890625, "learning_rate": 0.0001, "loss": 3.8641, "loss/crossentropy": 2.1211341619491577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19659722596406937, "step": 15070 }, { "epoch": 0.30144, "grad_norm": 3.484375, "grad_norm_var": 0.1490966796875, "learning_rate": 0.0001, "loss": 3.9967, "loss/crossentropy": 1.937787652015686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18829380720853806, "step": 15072 }, { "epoch": 0.30148, "grad_norm": 1.9140625, "grad_norm_var": 0.1507219950358073, "learning_rate": 0.0001, "loss": 3.922, "loss/crossentropy": 1.9047453999519348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18311911821365356, "step": 15074 }, { "epoch": 0.30152, "grad_norm": 2.171875, "grad_norm_var": 0.15110651652018228, "learning_rate": 0.0001, "loss": 4.3423, "loss/crossentropy": 2.1409407258033752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21341674029827118, "step": 15076 }, { "epoch": 0.30156, "grad_norm": 2.265625, "grad_norm_var": 0.15178197224934895, "learning_rate": 0.0001, "loss": 4.2815, "loss/crossentropy": 2.188783288002014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2406207174062729, "step": 15078 }, { "epoch": 0.3016, "grad_norm": 2.125, "grad_norm_var": 0.15162938435872395, "learning_rate": 0.0001, "loss": 4.368, "loss/crossentropy": 2.088913679122925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055455967783928, "step": 15080 }, { "epoch": 0.30164, "grad_norm": 1.9921875, "grad_norm_var": 0.15350316365559896, "learning_rate": 0.0001, "loss": 3.9039, "loss/crossentropy": 1.9477434754371643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19234874844551086, "step": 15082 }, { "epoch": 0.30168, "grad_norm": 1.9375, "grad_norm_var": 0.1490069071451823, "learning_rate": 0.0001, "loss": 4.2653, "loss/crossentropy": 2.2611928582191467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2250279188156128, "step": 15084 }, { "epoch": 0.30172, "grad_norm": 2.109375, "grad_norm_var": 0.14735514322916668, "learning_rate": 0.0001, "loss": 4.043, "loss/crossentropy": 2.340996742248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2201436087489128, "step": 15086 }, { "epoch": 0.30176, "grad_norm": 1.8515625, "grad_norm_var": 0.01718724568684896, "learning_rate": 0.0001, "loss": 3.9232, "loss/crossentropy": 2.045587956905365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049652263522148, "step": 15088 }, { "epoch": 0.3018, "grad_norm": 1.9453125, "grad_norm_var": 0.015265909830729167, "learning_rate": 0.0001, "loss": 3.9266, "loss/crossentropy": 1.6079826951026917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1584620103240013, "step": 15090 }, { "epoch": 0.30184, "grad_norm": 2.046875, "grad_norm_var": 0.013494618733723958, "learning_rate": 0.0001, "loss": 4.1049, "loss/crossentropy": 2.122701048851013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2095094472169876, "step": 15092 }, { "epoch": 0.30188, "grad_norm": 1.8828125, "grad_norm_var": 0.008243560791015625, "learning_rate": 0.0001, "loss": 3.8602, "loss/crossentropy": 1.8888922929763794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19282665848731995, "step": 15094 }, { "epoch": 0.30192, "grad_norm": 1.9140625, "grad_norm_var": 0.0063873291015625, "learning_rate": 0.0001, "loss": 4.1279, "loss/crossentropy": 2.121204972267151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894393339753151, "step": 15096 }, { "epoch": 0.30196, "grad_norm": 1.890625, "grad_norm_var": 0.005991363525390625, "learning_rate": 0.0001, "loss": 4.0433, "loss/crossentropy": 1.836807131767273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851387470960617, "step": 15098 }, { "epoch": 0.302, "grad_norm": 1.96875, "grad_norm_var": 0.005246734619140625, "learning_rate": 0.0001, "loss": 4.1723, "loss/crossentropy": 2.170082688331604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20964065939188004, "step": 15100 }, { "epoch": 0.30204, "grad_norm": 2.015625, "grad_norm_var": 0.0037595113118489582, "learning_rate": 0.0001, "loss": 4.0642, "loss/crossentropy": 2.128955125808716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20737067610025406, "step": 15102 }, { "epoch": 0.30208, "grad_norm": 1.890625, "grad_norm_var": 0.0062744140625, "learning_rate": 0.0001, "loss": 4.1561, "loss/crossentropy": 1.970711886882782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19101983308792114, "step": 15104 }, { "epoch": 0.30212, "grad_norm": 2.40625, "grad_norm_var": 0.018040974934895832, "learning_rate": 0.0001, "loss": 4.5089, "loss/crossentropy": 2.105665922164917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23741377890110016, "step": 15106 }, { "epoch": 0.30216, "grad_norm": 2.109375, "grad_norm_var": 0.01962865193684896, "learning_rate": 0.0001, "loss": 4.3766, "loss/crossentropy": 2.274402379989624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20633316040039062, "step": 15108 }, { "epoch": 0.3022, "grad_norm": 2.046875, "grad_norm_var": 0.017996978759765626, "learning_rate": 0.0001, "loss": 4.0393, "loss/crossentropy": 2.134031891822815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20932414382696152, "step": 15110 }, { "epoch": 0.30224, "grad_norm": 1.9140625, "grad_norm_var": 0.01930720011393229, "learning_rate": 0.0001, "loss": 3.9084, "loss/crossentropy": 2.015448212623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18043135851621628, "step": 15112 }, { "epoch": 0.30228, "grad_norm": 2.109375, "grad_norm_var": 0.018232981363932293, "learning_rate": 0.0001, "loss": 4.2025, "loss/crossentropy": 2.3585838079452515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2323882058262825, "step": 15114 }, { "epoch": 0.30232, "grad_norm": 1.9140625, "grad_norm_var": 0.018607584635416667, "learning_rate": 0.0001, "loss": 3.8586, "loss/crossentropy": 1.790964961051941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18150723725557327, "step": 15116 }, { "epoch": 0.30236, "grad_norm": 2.109375, "grad_norm_var": 0.018802642822265625, "learning_rate": 0.0001, "loss": 4.1433, "loss/crossentropy": 2.0206486582756042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102368101477623, "step": 15118 }, { "epoch": 0.3024, "grad_norm": 1.9921875, "grad_norm_var": 0.016169230143229168, "learning_rate": 0.0001, "loss": 4.1773, "loss/crossentropy": 2.21474289894104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22742661088705063, "step": 15120 }, { "epoch": 0.30244, "grad_norm": 2.1875, "grad_norm_var": 0.009781901041666667, "learning_rate": 0.0001, "loss": 3.9536, "loss/crossentropy": 2.0452207922935486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19045353680849075, "step": 15122 }, { "epoch": 0.30248, "grad_norm": 1.828125, "grad_norm_var": 0.0105865478515625, "learning_rate": 0.0001, "loss": 3.7914, "loss/crossentropy": 2.019612729549408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2032184973359108, "step": 15124 }, { "epoch": 0.30252, "grad_norm": 1.9921875, "grad_norm_var": 0.010221099853515625, "learning_rate": 0.0001, "loss": 4.0303, "loss/crossentropy": 2.1821396350860596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22832991182804108, "step": 15126 }, { "epoch": 0.30256, "grad_norm": 1.875, "grad_norm_var": 0.010088857014973958, "learning_rate": 0.0001, "loss": 3.7838, "loss/crossentropy": 2.0395348072052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18255837261676788, "step": 15128 }, { "epoch": 0.3026, "grad_norm": 1.9609375, "grad_norm_var": 0.009081013997395833, "learning_rate": 0.0001, "loss": 4.177, "loss/crossentropy": 2.1724050045013428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19672775268554688, "step": 15130 }, { "epoch": 0.30264, "grad_norm": 1.96875, "grad_norm_var": 0.008470662434895833, "learning_rate": 0.0001, "loss": 4.2438, "loss/crossentropy": 2.107967436313629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19564508646726608, "step": 15132 }, { "epoch": 0.30268, "grad_norm": 1.953125, "grad_norm_var": 0.007355753580729167, "learning_rate": 0.0001, "loss": 3.8674, "loss/crossentropy": 1.7179370522499084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17135357856750488, "step": 15134 }, { "epoch": 0.30272, "grad_norm": 2.078125, "grad_norm_var": 0.008119455973307292, "learning_rate": 0.0001, "loss": 4.104, "loss/crossentropy": 1.9004579186439514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19902431219816208, "step": 15136 }, { "epoch": 0.30276, "grad_norm": 2.0625, "grad_norm_var": 0.0052487691243489586, "learning_rate": 0.0001, "loss": 4.013, "loss/crossentropy": 1.9554911255836487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19999338686466217, "step": 15138 }, { "epoch": 0.3028, "grad_norm": 1.9140625, "grad_norm_var": 0.004793294270833333, "learning_rate": 0.0001, "loss": 4.1051, "loss/crossentropy": 1.8687474131584167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19250088185071945, "step": 15140 }, { "epoch": 0.30284, "grad_norm": 2.0625, "grad_norm_var": 0.004929351806640625, "learning_rate": 0.0001, "loss": 4.1217, "loss/crossentropy": 2.1360538005828857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22316065430641174, "step": 15142 }, { "epoch": 0.30288, "grad_norm": 2.125, "grad_norm_var": 0.006776682535807292, "learning_rate": 0.0001, "loss": 3.7681, "loss/crossentropy": 1.6100040078163147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16926150023937225, "step": 15144 }, { "epoch": 0.30292, "grad_norm": 2.09375, "grad_norm_var": 0.0070302327473958336, "learning_rate": 0.0001, "loss": 4.0459, "loss/crossentropy": 1.983083188533783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19075944274663925, "step": 15146 }, { "epoch": 0.30296, "grad_norm": 1.9609375, "grad_norm_var": 0.009110514322916667, "learning_rate": 0.0001, "loss": 3.8722, "loss/crossentropy": 1.9027757048606873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069152221083641, "step": 15148 }, { "epoch": 0.303, "grad_norm": 1.953125, "grad_norm_var": 0.009144846598307292, "learning_rate": 0.0001, "loss": 3.996, "loss/crossentropy": 1.9141475558280945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19106873124837875, "step": 15150 }, { "epoch": 0.30304, "grad_norm": 2.109375, "grad_norm_var": 0.009821573893229166, "learning_rate": 0.0001, "loss": 4.0872, "loss/crossentropy": 1.924220085144043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19870451837778091, "step": 15152 }, { "epoch": 0.30308, "grad_norm": 1.9921875, "grad_norm_var": 0.009169260660807291, "learning_rate": 0.0001, "loss": 4.1611, "loss/crossentropy": 2.18874990940094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040283977985382, "step": 15154 }, { "epoch": 0.30312, "grad_norm": 1.9140625, "grad_norm_var": 0.0096588134765625, "learning_rate": 0.0001, "loss": 3.6905, "loss/crossentropy": 2.0296109914779663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20384181290864944, "step": 15156 }, { "epoch": 0.30316, "grad_norm": 2.03125, "grad_norm_var": 0.0091949462890625, "learning_rate": 0.0001, "loss": 4.2475, "loss/crossentropy": 2.1908326148986816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20775151252746582, "step": 15158 }, { "epoch": 0.3032, "grad_norm": 2.0625, "grad_norm_var": 0.007523345947265625, "learning_rate": 0.0001, "loss": 4.191, "loss/crossentropy": 2.1348751187324524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21342730522155762, "step": 15160 }, { "epoch": 0.30324, "grad_norm": 2.09375, "grad_norm_var": 0.0075103759765625, "learning_rate": 0.0001, "loss": 3.9642, "loss/crossentropy": 2.03944593667984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934272050857544, "step": 15162 }, { "epoch": 0.30328, "grad_norm": 1.9609375, "grad_norm_var": 0.005475870768229167, "learning_rate": 0.0001, "loss": 3.9692, "loss/crossentropy": 1.8574647307395935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1847086101770401, "step": 15164 }, { "epoch": 0.30332, "grad_norm": 1.9140625, "grad_norm_var": 0.005671183268229167, "learning_rate": 0.0001, "loss": 4.3025, "loss/crossentropy": 2.219611406326294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982950195670128, "step": 15166 }, { "epoch": 0.30336, "grad_norm": 2.078125, "grad_norm_var": 0.005159250895182292, "learning_rate": 0.0001, "loss": 4.1903, "loss/crossentropy": 2.024726688861847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217759907245636, "step": 15168 }, { "epoch": 0.3034, "grad_norm": 2.046875, "grad_norm_var": 0.005353800455729167, "learning_rate": 0.0001, "loss": 4.2273, "loss/crossentropy": 2.196849226951599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21290963143110275, "step": 15170 }, { "epoch": 0.30344, "grad_norm": 1.7421875, "grad_norm_var": 0.00750732421875, "learning_rate": 0.0001, "loss": 4.0096, "loss/crossentropy": 1.9023171067237854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19199151545763016, "step": 15172 }, { "epoch": 0.30348, "grad_norm": 2.03125, "grad_norm_var": 0.008644358317057291, "learning_rate": 0.0001, "loss": 4.0798, "loss/crossentropy": 2.1286264657974243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20934996753931046, "step": 15174 }, { "epoch": 0.30352, "grad_norm": 2.046875, "grad_norm_var": 0.008007558186848958, "learning_rate": 0.0001, "loss": 4.2621, "loss/crossentropy": 2.2262455224990845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091647908091545, "step": 15176 }, { "epoch": 0.30356, "grad_norm": 2.140625, "grad_norm_var": 0.04183349609375, "learning_rate": 0.0001, "loss": 4.2439, "loss/crossentropy": 2.4126710891723633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22972645610570908, "step": 15178 }, { "epoch": 0.3036, "grad_norm": 2.078125, "grad_norm_var": 0.042569732666015624, "learning_rate": 0.0001, "loss": 4.4268, "loss/crossentropy": 2.4708153009414673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23798953741788864, "step": 15180 }, { "epoch": 0.30364, "grad_norm": 2.109375, "grad_norm_var": 0.04053929646809896, "learning_rate": 0.0001, "loss": 4.1841, "loss/crossentropy": 1.9924429655075073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19149084389209747, "step": 15182 }, { "epoch": 0.30368, "grad_norm": 1.953125, "grad_norm_var": 0.04182103474934896, "learning_rate": 0.0001, "loss": 4.1335, "loss/crossentropy": 1.8018346428871155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1883997619152069, "step": 15184 }, { "epoch": 0.30372, "grad_norm": 1.9296875, "grad_norm_var": 0.04639867146809896, "learning_rate": 0.0001, "loss": 3.96, "loss/crossentropy": 2.001536011695862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008758783340454, "step": 15186 }, { "epoch": 0.30376, "grad_norm": 2.1875, "grad_norm_var": 0.06099828084309896, "learning_rate": 0.0001, "loss": 4.2308, "loss/crossentropy": 1.9239726066589355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068573236465454, "step": 15188 }, { "epoch": 0.3038, "grad_norm": 2.296875, "grad_norm_var": 0.059845987955729166, "learning_rate": 0.0001, "loss": 4.303, "loss/crossentropy": 2.252953827381134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22697407007217407, "step": 15190 }, { "epoch": 0.30384, "grad_norm": 1.921875, "grad_norm_var": 0.0631011962890625, "learning_rate": 0.0001, "loss": 4.174, "loss/crossentropy": 2.135131001472473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088206186890602, "step": 15192 }, { "epoch": 0.30388, "grad_norm": 2.09375, "grad_norm_var": 0.0391754150390625, "learning_rate": 0.0001, "loss": 4.22, "loss/crossentropy": 2.394876003265381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22017298638820648, "step": 15194 }, { "epoch": 0.30392, "grad_norm": 2.078125, "grad_norm_var": 0.03862711588541667, "learning_rate": 0.0001, "loss": 4.3951, "loss/crossentropy": 2.3517301082611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21459681540727615, "step": 15196 }, { "epoch": 0.30396, "grad_norm": 2.015625, "grad_norm_var": 0.038914998372395836, "learning_rate": 0.0001, "loss": 4.3068, "loss/crossentropy": 1.956869900226593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19348695874214172, "step": 15198 }, { "epoch": 0.304, "grad_norm": 1.9921875, "grad_norm_var": 0.045660146077473956, "learning_rate": 0.0001, "loss": 3.5481, "loss/crossentropy": 1.8349076509475708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19203926622867584, "step": 15200 }, { "epoch": 0.30404, "grad_norm": 2.140625, "grad_norm_var": 0.041265614827473956, "learning_rate": 0.0001, "loss": 4.1749, "loss/crossentropy": 2.253283977508545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24825416505336761, "step": 15202 }, { "epoch": 0.30408, "grad_norm": 1.9609375, "grad_norm_var": 0.0156646728515625, "learning_rate": 0.0001, "loss": 3.9585, "loss/crossentropy": 1.7219101190567017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18302495777606964, "step": 15204 }, { "epoch": 0.30412, "grad_norm": 2.046875, "grad_norm_var": 0.010309855143229166, "learning_rate": 0.0001, "loss": 4.4398, "loss/crossentropy": 2.2931004762649536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21909544616937637, "step": 15206 }, { "epoch": 0.30416, "grad_norm": 2.125, "grad_norm_var": 0.009883626302083334, "learning_rate": 0.0001, "loss": 4.0707, "loss/crossentropy": 1.9331820011138916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19565977901220322, "step": 15208 }, { "epoch": 0.3042, "grad_norm": 2.015625, "grad_norm_var": 0.008568318684895833, "learning_rate": 0.0001, "loss": 4.0936, "loss/crossentropy": 1.9863171577453613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20089460909366608, "step": 15210 }, { "epoch": 0.30424, "grad_norm": 1.9765625, "grad_norm_var": 0.010945383707682292, "learning_rate": 0.0001, "loss": 4.3762, "loss/crossentropy": 2.326872229576111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20695248246192932, "step": 15212 }, { "epoch": 0.30428, "grad_norm": 2.171875, "grad_norm_var": 0.012679036458333333, "learning_rate": 0.0001, "loss": 4.1684, "loss/crossentropy": 2.0216678380966187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19882218539714813, "step": 15214 }, { "epoch": 0.30432, "grad_norm": 2.171875, "grad_norm_var": 0.0074503580729166664, "learning_rate": 0.0001, "loss": 4.3682, "loss/crossentropy": 2.170402765274048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22156177461147308, "step": 15216 }, { "epoch": 0.30436, "grad_norm": 2.03125, "grad_norm_var": 0.007608795166015625, "learning_rate": 0.0001, "loss": 4.1655, "loss/crossentropy": 1.9652912616729736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20658399909734726, "step": 15218 }, { "epoch": 0.3044, "grad_norm": 1.90625, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 4.1234, "loss/crossentropy": 2.4518260955810547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20796608924865723, "step": 15220 }, { "epoch": 0.30444, "grad_norm": 1.984375, "grad_norm_var": 0.009952545166015625, "learning_rate": 0.0001, "loss": 4.0179, "loss/crossentropy": 1.9519163370132446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19269376248121262, "step": 15222 }, { "epoch": 0.30448, "grad_norm": 1.9375, "grad_norm_var": 0.010758209228515624, "learning_rate": 0.0001, "loss": 3.9411, "loss/crossentropy": 2.1493905782699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19713661074638367, "step": 15224 }, { "epoch": 0.30452, "grad_norm": 2.5, "grad_norm_var": 0.026878865559895833, "learning_rate": 0.0001, "loss": 4.2124, "loss/crossentropy": 2.2856662273406982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23240232467651367, "step": 15226 }, { "epoch": 0.30456, "grad_norm": 1.9140625, "grad_norm_var": 0.0249176025390625, "learning_rate": 0.0001, "loss": 4.1723, "loss/crossentropy": 2.3631181716918945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22731658816337585, "step": 15228 }, { "epoch": 0.3046, "grad_norm": 2.015625, "grad_norm_var": 0.027469635009765625, "learning_rate": 0.0001, "loss": 4.0952, "loss/crossentropy": 2.1270273327827454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736727863550186, "step": 15230 }, { "epoch": 0.30464, "grad_norm": 2.09375, "grad_norm_var": 0.029320271809895833, "learning_rate": 0.0001, "loss": 4.4801, "loss/crossentropy": 2.1448079347610474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20350141823291779, "step": 15232 }, { "epoch": 0.30468, "grad_norm": 2.71875, "grad_norm_var": 0.059576161702473956, "learning_rate": 0.0001, "loss": 4.295, "loss/crossentropy": 2.0582846999168396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19898280501365662, "step": 15234 }, { "epoch": 0.30472, "grad_norm": 2.078125, "grad_norm_var": 0.0597076416015625, "learning_rate": 0.0001, "loss": 4.0946, "loss/crossentropy": 2.2282591462135315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22299205511808395, "step": 15236 }, { "epoch": 0.30476, "grad_norm": 4.34375, "grad_norm_var": 0.3757484436035156, "learning_rate": 0.0001, "loss": 4.4031, "loss/crossentropy": 1.902436077594757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19752992689609528, "step": 15238 }, { "epoch": 0.3048, "grad_norm": 2.09375, "grad_norm_var": 0.3626177469889323, "learning_rate": 0.0001, "loss": 4.5476, "loss/crossentropy": 2.5741195678710938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25522060692310333, "step": 15240 }, { "epoch": 0.30484, "grad_norm": 2.078125, "grad_norm_var": 0.3576942443847656, "learning_rate": 0.0001, "loss": 4.2319, "loss/crossentropy": 2.261489987373352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2379501387476921, "step": 15242 }, { "epoch": 0.30488, "grad_norm": 1.921875, "grad_norm_var": 0.3581451416015625, "learning_rate": 0.0001, "loss": 4.1728, "loss/crossentropy": 2.3075523376464844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21830804646015167, "step": 15244 }, { "epoch": 0.30492, "grad_norm": 1.9375, "grad_norm_var": 0.36470133463541665, "learning_rate": 0.0001, "loss": 3.9585, "loss/crossentropy": 2.096464157104492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073582485318184, "step": 15246 }, { "epoch": 0.30496, "grad_norm": 2.078125, "grad_norm_var": 0.41076558430989585, "learning_rate": 0.0001, "loss": 4.0121, "loss/crossentropy": 1.9624161124229431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19424450397491455, "step": 15248 }, { "epoch": 0.305, "grad_norm": 2.0625, "grad_norm_var": 0.38983154296875, "learning_rate": 0.0001, "loss": 4.1351, "loss/crossentropy": 2.0838447213172913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22090255469083786, "step": 15250 }, { "epoch": 0.30504, "grad_norm": 2.171875, "grad_norm_var": 0.379644521077474, "learning_rate": 0.0001, "loss": 4.4244, "loss/crossentropy": 2.349258542060852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20451825857162476, "step": 15252 }, { "epoch": 0.30508, "grad_norm": 2.09375, "grad_norm_var": 0.06938451131184896, "learning_rate": 0.0001, "loss": 4.328, "loss/crossentropy": 2.3996351957321167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24550051987171173, "step": 15254 }, { "epoch": 0.30512, "grad_norm": 1.859375, "grad_norm_var": 0.07430191040039062, "learning_rate": 0.0001, "loss": 4.0098, "loss/crossentropy": 2.325650215148926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2189955934882164, "step": 15256 }, { "epoch": 0.30516, "grad_norm": 2.015625, "grad_norm_var": 0.07815348307291667, "learning_rate": 0.0001, "loss": 4.0063, "loss/crossentropy": 1.8568453788757324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18639766424894333, "step": 15258 }, { "epoch": 0.3052, "grad_norm": 1.875, "grad_norm_var": 0.08263346354166666, "learning_rate": 0.0001, "loss": 3.8955, "loss/crossentropy": 1.8268811106681824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18669788539409637, "step": 15260 }, { "epoch": 0.30524, "grad_norm": 2.125, "grad_norm_var": 0.08479715983072916, "learning_rate": 0.0001, "loss": 4.1124, "loss/crossentropy": 2.0555724501609802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101765051484108, "step": 15262 }, { "epoch": 0.30528, "grad_norm": 1.9296875, "grad_norm_var": 0.018070475260416666, "learning_rate": 0.0001, "loss": 4.1075, "loss/crossentropy": 2.080612599849701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18955913186073303, "step": 15264 }, { "epoch": 0.30532, "grad_norm": 1.890625, "grad_norm_var": 0.0137939453125, "learning_rate": 0.0001, "loss": 4.2678, "loss/crossentropy": 2.352793037891388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21857291460037231, "step": 15266 }, { "epoch": 0.30536, "grad_norm": 1.8671875, "grad_norm_var": 0.010434722900390625, "learning_rate": 0.0001, "loss": 4.0871, "loss/crossentropy": 2.0258968472480774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18990560621023178, "step": 15268 }, { "epoch": 0.3054, "grad_norm": 1.8046875, "grad_norm_var": 0.010871378580729167, "learning_rate": 0.0001, "loss": 4.3126, "loss/crossentropy": 2.228433310985565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20862557739019394, "step": 15270 }, { "epoch": 0.30544, "grad_norm": 1.9765625, "grad_norm_var": 0.012330881754557292, "learning_rate": 0.0001, "loss": 3.7731, "loss/crossentropy": 1.9475300312042236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20644760876893997, "step": 15272 }, { "epoch": 0.30548, "grad_norm": 1.9453125, "grad_norm_var": 0.012166341145833334, "learning_rate": 0.0001, "loss": 4.1685, "loss/crossentropy": 2.3502081632614136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22096982598304749, "step": 15274 }, { "epoch": 0.30552, "grad_norm": 2.09375, "grad_norm_var": 0.012064615885416666, "learning_rate": 0.0001, "loss": 4.294, "loss/crossentropy": 2.2581464052200317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21158118546009064, "step": 15276 }, { "epoch": 0.30556, "grad_norm": 2.171875, "grad_norm_var": 0.014054361979166667, "learning_rate": 0.0001, "loss": 4.0301, "loss/crossentropy": 1.9583097696304321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19379162788391113, "step": 15278 }, { "epoch": 0.3056, "grad_norm": 2.109375, "grad_norm_var": 0.016527303059895835, "learning_rate": 0.0001, "loss": 4.4464, "loss/crossentropy": 1.988980233669281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19495020806789398, "step": 15280 }, { "epoch": 0.30564, "grad_norm": 1.984375, "grad_norm_var": 0.017350260416666666, "learning_rate": 0.0001, "loss": 4.1733, "loss/crossentropy": 2.1816195249557495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2196531891822815, "step": 15282 }, { "epoch": 0.30568, "grad_norm": 1.9375, "grad_norm_var": 0.015264638264973958, "learning_rate": 0.0001, "loss": 3.7994, "loss/crossentropy": 2.180557370185852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106764316558838, "step": 15284 }, { "epoch": 0.30572, "grad_norm": 2.046875, "grad_norm_var": 0.011027018229166666, "learning_rate": 0.0001, "loss": 4.0655, "loss/crossentropy": 1.901601493358612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18202877044677734, "step": 15286 }, { "epoch": 0.30576, "grad_norm": 2.0625, "grad_norm_var": 0.019301096598307293, "learning_rate": 0.0001, "loss": 4.1989, "loss/crossentropy": 2.259449601173401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23166191577911377, "step": 15288 }, { "epoch": 0.3058, "grad_norm": 1.90625, "grad_norm_var": 0.020514933268229167, "learning_rate": 0.0001, "loss": 3.9866, "loss/crossentropy": 1.786646544933319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19156906753778458, "step": 15290 }, { "epoch": 0.30584, "grad_norm": 1.9453125, "grad_norm_var": 0.02205174763997396, "learning_rate": 0.0001, "loss": 3.9258, "loss/crossentropy": 1.8522255420684814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828666776418686, "step": 15292 }, { "epoch": 0.30588, "grad_norm": 1.9765625, "grad_norm_var": 0.02127685546875, "learning_rate": 0.0001, "loss": 4.2543, "loss/crossentropy": 2.195667266845703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21043212711811066, "step": 15294 }, { "epoch": 0.30592, "grad_norm": 2.0, "grad_norm_var": 0.020918528238932293, "learning_rate": 0.0001, "loss": 3.8389, "loss/crossentropy": 1.8748087882995605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20358053594827652, "step": 15296 }, { "epoch": 0.30596, "grad_norm": 1.9453125, "grad_norm_var": 0.017536417643229166, "learning_rate": 0.0001, "loss": 4.0789, "loss/crossentropy": 2.2658244371414185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206317037343979, "step": 15298 }, { "epoch": 0.306, "grad_norm": 2.140625, "grad_norm_var": 0.018778483072916668, "learning_rate": 0.0001, "loss": 4.2882, "loss/crossentropy": 1.7914378643035889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20813927054405212, "step": 15300 }, { "epoch": 0.30604, "grad_norm": 2.109375, "grad_norm_var": 0.01895319620768229, "learning_rate": 0.0001, "loss": 4.1428, "loss/crossentropy": 2.046573519706726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20681843161582947, "step": 15302 }, { "epoch": 0.30608, "grad_norm": 2.03125, "grad_norm_var": 0.005960845947265625, "learning_rate": 0.0001, "loss": 4.2538, "loss/crossentropy": 2.0068886280059814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.201521098613739, "step": 15304 }, { "epoch": 0.30612, "grad_norm": 2.03125, "grad_norm_var": 0.005509440104166667, "learning_rate": 0.0001, "loss": 4.0672, "loss/crossentropy": 2.2020740509033203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20413114875555038, "step": 15306 }, { "epoch": 0.30616, "grad_norm": 2.09375, "grad_norm_var": 0.0057769775390625, "learning_rate": 0.0001, "loss": 4.3654, "loss/crossentropy": 2.3031119108200073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20878810435533524, "step": 15308 }, { "epoch": 0.3062, "grad_norm": 1.9296875, "grad_norm_var": 0.006204986572265625, "learning_rate": 0.0001, "loss": 3.7786, "loss/crossentropy": 1.6445855498313904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1618219092488289, "step": 15310 }, { "epoch": 0.30624, "grad_norm": 2.046875, "grad_norm_var": 0.004572550455729167, "learning_rate": 0.0001, "loss": 4.1378, "loss/crossentropy": 1.995127022266388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21076080948114395, "step": 15312 }, { "epoch": 0.30628, "grad_norm": 1.90625, "grad_norm_var": 0.005411783854166667, "learning_rate": 0.0001, "loss": 3.6278, "loss/crossentropy": 1.8340229392051697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19418048858642578, "step": 15314 }, { "epoch": 0.30632, "grad_norm": 2.0625, "grad_norm_var": 0.007106272379557291, "learning_rate": 0.0001, "loss": 3.9496, "loss/crossentropy": 1.713787317276001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17608212679624557, "step": 15316 }, { "epoch": 0.30636, "grad_norm": 2.0625, "grad_norm_var": 0.006967926025390625, "learning_rate": 0.0001, "loss": 4.0223, "loss/crossentropy": 1.7105762362480164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1760871261358261, "step": 15318 }, { "epoch": 0.3064, "grad_norm": 1.9375, "grad_norm_var": 0.0071441650390625, "learning_rate": 0.0001, "loss": 4.0326, "loss/crossentropy": 2.337005376815796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2222321778535843, "step": 15320 }, { "epoch": 0.30644, "grad_norm": 2.046875, "grad_norm_var": 0.008868153889973958, "learning_rate": 0.0001, "loss": 4.3269, "loss/crossentropy": 1.8426868915557861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1880369558930397, "step": 15322 }, { "epoch": 0.30648, "grad_norm": 1.9765625, "grad_norm_var": 0.009162394205729167, "learning_rate": 0.0001, "loss": 3.7798, "loss/crossentropy": 2.300028443336487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22529813647270203, "step": 15324 }, { "epoch": 0.30652, "grad_norm": 1.953125, "grad_norm_var": 0.009193674723307291, "learning_rate": 0.0001, "loss": 3.9838, "loss/crossentropy": 2.3558120727539062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2000514343380928, "step": 15326 }, { "epoch": 0.30656, "grad_norm": 1.8828125, "grad_norm_var": 0.007879384358723958, "learning_rate": 0.0001, "loss": 3.6921, "loss/crossentropy": 1.681145966053009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756032332777977, "step": 15328 }, { "epoch": 0.3066, "grad_norm": 2.0, "grad_norm_var": 0.010298665364583333, "learning_rate": 0.0001, "loss": 4.41, "loss/crossentropy": 2.1566712260246277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21067694574594498, "step": 15330 }, { "epoch": 0.30664, "grad_norm": 1.9765625, "grad_norm_var": 0.0075347900390625, "learning_rate": 0.0001, "loss": 3.993, "loss/crossentropy": 2.125000774860382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20768606662750244, "step": 15332 }, { "epoch": 0.30668, "grad_norm": 1.9921875, "grad_norm_var": 0.007328033447265625, "learning_rate": 0.0001, "loss": 4.0805, "loss/crossentropy": 2.168649673461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20393794029951096, "step": 15334 }, { "epoch": 0.30672, "grad_norm": 2.09375, "grad_norm_var": 0.008072662353515624, "learning_rate": 0.0001, "loss": 3.9154, "loss/crossentropy": 1.657529592514038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19351129233837128, "step": 15336 }, { "epoch": 0.30676, "grad_norm": 2.0625, "grad_norm_var": 0.006780751546223958, "learning_rate": 0.0001, "loss": 4.2166, "loss/crossentropy": 2.115979850292206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20223377645015717, "step": 15338 }, { "epoch": 0.3068, "grad_norm": 1.9921875, "grad_norm_var": 0.005833943684895833, "learning_rate": 0.0001, "loss": 4.1288, "loss/crossentropy": 1.699661135673523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17160461097955704, "step": 15340 }, { "epoch": 0.30684, "grad_norm": 1.9453125, "grad_norm_var": 0.008180491129557292, "learning_rate": 0.0001, "loss": 3.8418, "loss/crossentropy": 2.141602098941803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19209860265254974, "step": 15342 }, { "epoch": 0.30688, "grad_norm": 2.0625, "grad_norm_var": 0.007806142171223958, "learning_rate": 0.0001, "loss": 4.3619, "loss/crossentropy": 2.1502809524536133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21510163694620132, "step": 15344 }, { "epoch": 0.30692, "grad_norm": 1.984375, "grad_norm_var": 0.006078847249348958, "learning_rate": 0.0001, "loss": 4.0684, "loss/crossentropy": 2.1354172825813293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19959942996501923, "step": 15346 }, { "epoch": 0.30696, "grad_norm": 2.015625, "grad_norm_var": 0.006095377604166666, "learning_rate": 0.0001, "loss": 4.1196, "loss/crossentropy": 2.0634223222732544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073688805103302, "step": 15348 }, { "epoch": 0.307, "grad_norm": 2.125, "grad_norm_var": 0.008385976155598959, "learning_rate": 0.0001, "loss": 4.1489, "loss/crossentropy": 1.9851733446121216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2060396894812584, "step": 15350 }, { "epoch": 0.30704, "grad_norm": 1.9921875, "grad_norm_var": 0.007490793863932292, "learning_rate": 0.0001, "loss": 4.1196, "loss/crossentropy": 2.3790550231933594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23205619305372238, "step": 15352 }, { "epoch": 0.30708, "grad_norm": 1.8984375, "grad_norm_var": 0.008552042643229167, "learning_rate": 0.0001, "loss": 3.9085, "loss/crossentropy": 1.943642258644104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18238085508346558, "step": 15354 }, { "epoch": 0.30712, "grad_norm": 2.0, "grad_norm_var": 0.008988189697265624, "learning_rate": 0.0001, "loss": 4.112, "loss/crossentropy": 1.9680217504501343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18599005788564682, "step": 15356 }, { "epoch": 0.30716, "grad_norm": 1.890625, "grad_norm_var": 0.0077898661295572914, "learning_rate": 0.0001, "loss": 3.7402, "loss/crossentropy": 1.9464871287345886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18814007937908173, "step": 15358 }, { "epoch": 0.3072, "grad_norm": 2.265625, "grad_norm_var": 0.013179524739583334, "learning_rate": 0.0001, "loss": 4.0231, "loss/crossentropy": 2.0038596987724304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19203800708055496, "step": 15360 }, { "epoch": 0.30724, "grad_norm": 1.9921875, "grad_norm_var": 0.013071441650390625, "learning_rate": 0.0001, "loss": 4.0843, "loss/crossentropy": 1.93240225315094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20595912635326385, "step": 15362 }, { "epoch": 0.30728, "grad_norm": 2.0, "grad_norm_var": 0.012938435872395833, "learning_rate": 0.0001, "loss": 4.2042, "loss/crossentropy": 2.067903220653534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21503040939569473, "step": 15364 }, { "epoch": 0.30732, "grad_norm": 2.140625, "grad_norm_var": 0.012674713134765625, "learning_rate": 0.0001, "loss": 3.9056, "loss/crossentropy": 2.179181218147278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20675741881132126, "step": 15366 }, { "epoch": 0.30736, "grad_norm": 2.078125, "grad_norm_var": 0.02471923828125, "learning_rate": 0.0001, "loss": 4.4657, "loss/crossentropy": 2.106004774570465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23683416098356247, "step": 15368 }, { "epoch": 0.3074, "grad_norm": 2.0, "grad_norm_var": 0.023266347249348958, "learning_rate": 0.0001, "loss": 4.135, "loss/crossentropy": 2.2843246459960938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24139763414859772, "step": 15370 }, { "epoch": 0.30744, "grad_norm": 2.0, "grad_norm_var": 0.028484853108723958, "learning_rate": 0.0001, "loss": 4.122, "loss/crossentropy": 2.2482924461364746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2242894545197487, "step": 15372 }, { "epoch": 0.30748, "grad_norm": 1.9765625, "grad_norm_var": 0.025321451822916667, "learning_rate": 0.0001, "loss": 4.221, "loss/crossentropy": 2.1573593616485596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22176649421453476, "step": 15374 }, { "epoch": 0.30752, "grad_norm": 1.8828125, "grad_norm_var": 0.022705078125, "learning_rate": 0.0001, "loss": 4.219, "loss/crossentropy": 1.9243710041046143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19137389957904816, "step": 15376 }, { "epoch": 0.30756, "grad_norm": 1.984375, "grad_norm_var": 0.02211481730143229, "learning_rate": 0.0001, "loss": 4.0903, "loss/crossentropy": 2.004511833190918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20657775551080704, "step": 15378 }, { "epoch": 0.3076, "grad_norm": 1.859375, "grad_norm_var": 0.022956339518229167, "learning_rate": 0.0001, "loss": 4.0174, "loss/crossentropy": 1.8409560918807983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19139550626277924, "step": 15380 }, { "epoch": 0.30764, "grad_norm": 2.09375, "grad_norm_var": 0.019974772135416666, "learning_rate": 0.0001, "loss": 4.2722, "loss/crossentropy": 2.1442220211029053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23278377205133438, "step": 15382 }, { "epoch": 0.30768, "grad_norm": 1.9765625, "grad_norm_var": 0.013651275634765625, "learning_rate": 0.0001, "loss": 4.2915, "loss/crossentropy": 2.086996078491211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20602282136678696, "step": 15384 }, { "epoch": 0.30772, "grad_norm": 2.125, "grad_norm_var": 0.014404042561848959, "learning_rate": 0.0001, "loss": 4.4884, "loss/crossentropy": 2.479863405227661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23815791308879852, "step": 15386 }, { "epoch": 0.30776, "grad_norm": 1.953125, "grad_norm_var": 0.009034983317057292, "learning_rate": 0.0001, "loss": 4.0648, "loss/crossentropy": 1.9218324422836304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22571419924497604, "step": 15388 }, { "epoch": 0.3078, "grad_norm": 2.078125, "grad_norm_var": 0.009004720052083333, "learning_rate": 0.0001, "loss": 4.0315, "loss/crossentropy": 1.9488168954849243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004484310746193, "step": 15390 }, { "epoch": 0.30784, "grad_norm": 2.09375, "grad_norm_var": 0.006585439046223958, "learning_rate": 0.0001, "loss": 4.3221, "loss/crossentropy": 2.307602286338806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23881366103887558, "step": 15392 }, { "epoch": 0.30788, "grad_norm": 1.9609375, "grad_norm_var": 0.007283528645833333, "learning_rate": 0.0001, "loss": 4.1025, "loss/crossentropy": 2.1380842328071594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019854038953781, "step": 15394 }, { "epoch": 0.30792, "grad_norm": 2.125, "grad_norm_var": 0.0072509765625, "learning_rate": 0.0001, "loss": 4.1245, "loss/crossentropy": 2.0551159977912903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18669020384550095, "step": 15396 }, { "epoch": 0.30796, "grad_norm": 1.8828125, "grad_norm_var": 0.009250640869140625, "learning_rate": 0.0001, "loss": 3.959, "loss/crossentropy": 1.9213807582855225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1843273714184761, "step": 15398 }, { "epoch": 0.308, "grad_norm": 2.015625, "grad_norm_var": 0.008628082275390626, "learning_rate": 0.0001, "loss": 3.9724, "loss/crossentropy": 2.0028855204582214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215728610754013, "step": 15400 }, { "epoch": 0.30804, "grad_norm": 2.0, "grad_norm_var": 0.0064776102701822914, "learning_rate": 0.0001, "loss": 4.0287, "loss/crossentropy": 1.957477331161499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20727651566267014, "step": 15402 }, { "epoch": 0.30808, "grad_norm": 1.8515625, "grad_norm_var": 0.0080718994140625, "learning_rate": 0.0001, "loss": 4.0255, "loss/crossentropy": 2.2241835594177246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25234874337911606, "step": 15404 }, { "epoch": 0.30812, "grad_norm": 2.046875, "grad_norm_var": 0.006852213541666667, "learning_rate": 0.0001, "loss": 4.2677, "loss/crossentropy": 2.108901560306549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20304366201162338, "step": 15406 }, { "epoch": 0.30816, "grad_norm": 2.015625, "grad_norm_var": 0.0075032552083333336, "learning_rate": 0.0001, "loss": 4.302, "loss/crossentropy": 2.1005085110664368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21205321699380875, "step": 15408 }, { "epoch": 0.3082, "grad_norm": 2.03125, "grad_norm_var": 0.008139801025390626, "learning_rate": 0.0001, "loss": 4.2505, "loss/crossentropy": 2.1565412282943726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22271832078695297, "step": 15410 }, { "epoch": 0.30824, "grad_norm": 2.015625, "grad_norm_var": 0.006241607666015625, "learning_rate": 0.0001, "loss": 4.1236, "loss/crossentropy": 1.906798243522644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21911519020795822, "step": 15412 }, { "epoch": 0.30828, "grad_norm": 1.8828125, "grad_norm_var": 0.006089019775390625, "learning_rate": 0.0001, "loss": 4.1679, "loss/crossentropy": 2.0200547575950623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19892341643571854, "step": 15414 }, { "epoch": 0.30832, "grad_norm": 2.03125, "grad_norm_var": 0.006037394205729167, "learning_rate": 0.0001, "loss": 4.302, "loss/crossentropy": 2.14735209941864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21503761410713196, "step": 15416 }, { "epoch": 0.30836, "grad_norm": 2.078125, "grad_norm_var": 0.007405344645182292, "learning_rate": 0.0001, "loss": 4.2943, "loss/crossentropy": 2.0479459166526794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21268057823181152, "step": 15418 }, { "epoch": 0.3084, "grad_norm": 2.015625, "grad_norm_var": 0.006831614176432291, "learning_rate": 0.0001, "loss": 4.1969, "loss/crossentropy": 1.4835808873176575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16378673166036606, "step": 15420 }, { "epoch": 0.30844, "grad_norm": 1.984375, "grad_norm_var": 0.007802073160807292, "learning_rate": 0.0001, "loss": 4.058, "loss/crossentropy": 1.7942256331443787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18387632071971893, "step": 15422 }, { "epoch": 0.30848, "grad_norm": 2.046875, "grad_norm_var": 0.006534576416015625, "learning_rate": 0.0001, "loss": 4.1352, "loss/crossentropy": 1.8660341501235962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20189601182937622, "step": 15424 }, { "epoch": 0.30852, "grad_norm": 1.9453125, "grad_norm_var": 0.006257120768229167, "learning_rate": 0.0001, "loss": 4.0359, "loss/crossentropy": 1.8545112013816833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21422524750232697, "step": 15426 }, { "epoch": 0.30856, "grad_norm": 1.890625, "grad_norm_var": 0.0069163004557291664, "learning_rate": 0.0001, "loss": 4.1411, "loss/crossentropy": 2.204701781272888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21733464300632477, "step": 15428 }, { "epoch": 0.3086, "grad_norm": 1.9609375, "grad_norm_var": 0.006087239583333333, "learning_rate": 0.0001, "loss": 4.0865, "loss/crossentropy": 1.9295769929885864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20931877195835114, "step": 15430 }, { "epoch": 0.30864, "grad_norm": 2.140625, "grad_norm_var": 0.008534495035807292, "learning_rate": 0.0001, "loss": 3.9165, "loss/crossentropy": 2.0090243220329285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20928232371807098, "step": 15432 }, { "epoch": 0.30868, "grad_norm": 2.0625, "grad_norm_var": 0.009326171875, "learning_rate": 0.0001, "loss": 4.0517, "loss/crossentropy": 2.1884257793426514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20887935161590576, "step": 15434 }, { "epoch": 0.30872, "grad_norm": 1.8828125, "grad_norm_var": 0.009553019205729167, "learning_rate": 0.0001, "loss": 4.0101, "loss/crossentropy": 2.023981511592865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893448904156685, "step": 15436 }, { "epoch": 0.30876, "grad_norm": 1.890625, "grad_norm_var": 0.00909423828125, "learning_rate": 0.0001, "loss": 4.0168, "loss/crossentropy": 2.427197813987732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21649108827114105, "step": 15438 }, { "epoch": 0.3088, "grad_norm": 2.3125, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 4.1243, "loss/crossentropy": 2.2578816413879395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21297947317361832, "step": 15440 }, { "epoch": 0.30884, "grad_norm": 2.0625, "grad_norm_var": 0.015248362223307292, "learning_rate": 0.0001, "loss": 4.2977, "loss/crossentropy": 1.8909358382225037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1924509033560753, "step": 15442 }, { "epoch": 0.30888, "grad_norm": 2.046875, "grad_norm_var": 0.016228993733723957, "learning_rate": 0.0001, "loss": 4.0142, "loss/crossentropy": 2.344139575958252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20414643734693527, "step": 15444 }, { "epoch": 0.30892, "grad_norm": 1.96875, "grad_norm_var": 0.01624755859375, "learning_rate": 0.0001, "loss": 4.2373, "loss/crossentropy": 2.268660068511963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22332928329706192, "step": 15446 }, { "epoch": 0.30896, "grad_norm": 1.9765625, "grad_norm_var": 0.0148193359375, "learning_rate": 0.0001, "loss": 4.0944, "loss/crossentropy": 2.1276373267173767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123691812157631, "step": 15448 }, { "epoch": 0.309, "grad_norm": 1.8828125, "grad_norm_var": 0.012975819905598958, "learning_rate": 0.0001, "loss": 4.1025, "loss/crossentropy": 2.2471970319747925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2228669673204422, "step": 15450 }, { "epoch": 0.30904, "grad_norm": 2.0625, "grad_norm_var": 0.0124908447265625, "learning_rate": 0.0001, "loss": 4.2153, "loss/crossentropy": 2.122064530849457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22981297969818115, "step": 15452 }, { "epoch": 0.30908, "grad_norm": 1.7734375, "grad_norm_var": 0.014823150634765626, "learning_rate": 0.0001, "loss": 3.815, "loss/crossentropy": 2.016224443912506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19105292856693268, "step": 15454 }, { "epoch": 0.30912, "grad_norm": 1.8359375, "grad_norm_var": 0.016193644205729166, "learning_rate": 0.0001, "loss": 4.1518, "loss/crossentropy": 1.9021474719047546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23519159853458405, "step": 15456 }, { "epoch": 0.30916, "grad_norm": 2.09375, "grad_norm_var": 0.016617838541666666, "learning_rate": 0.0001, "loss": 4.0971, "loss/crossentropy": 2.067293703556061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20525048673152924, "step": 15458 }, { "epoch": 0.3092, "grad_norm": 2.3125, "grad_norm_var": 0.021930948893229166, "learning_rate": 0.0001, "loss": 4.3147, "loss/crossentropy": 2.0720648169517517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20188772678375244, "step": 15460 }, { "epoch": 0.30924, "grad_norm": 2.015625, "grad_norm_var": 0.030890909830729167, "learning_rate": 0.0001, "loss": 4.212, "loss/crossentropy": 2.1377363204956055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20943745225667953, "step": 15462 }, { "epoch": 0.30928, "grad_norm": 2.078125, "grad_norm_var": 0.028595987955729166, "learning_rate": 0.0001, "loss": 4.0799, "loss/crossentropy": 1.9101733565330505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18436391651630402, "step": 15464 }, { "epoch": 0.30932, "grad_norm": 2.015625, "grad_norm_var": 0.026775868733723958, "learning_rate": 0.0001, "loss": 4.0945, "loss/crossentropy": 2.205660581588745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2431517392396927, "step": 15466 }, { "epoch": 0.30936, "grad_norm": 2.078125, "grad_norm_var": 0.02670262654622396, "learning_rate": 0.0001, "loss": 4.502, "loss/crossentropy": 2.3487337827682495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22333069890737534, "step": 15468 }, { "epoch": 0.3094, "grad_norm": 1.9609375, "grad_norm_var": 0.021971638997395834, "learning_rate": 0.0001, "loss": 3.9717, "loss/crossentropy": 1.953830897808075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20369011163711548, "step": 15470 }, { "epoch": 0.30944, "grad_norm": 2.0625, "grad_norm_var": 0.0156402587890625, "learning_rate": 0.0001, "loss": 4.2282, "loss/crossentropy": 2.1175807118415833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23428738862276077, "step": 15472 }, { "epoch": 0.30948, "grad_norm": 1.9375, "grad_norm_var": 0.016585032145182293, "learning_rate": 0.0001, "loss": 4.0257, "loss/crossentropy": 2.114426612854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20113752037286758, "step": 15474 }, { "epoch": 0.30952, "grad_norm": 2.03125, "grad_norm_var": 0.01676203409830729, "learning_rate": 0.0001, "loss": 3.8239, "loss/crossentropy": 2.121155083179474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23428727686405182, "step": 15476 }, { "epoch": 0.30956, "grad_norm": 1.9375, "grad_norm_var": 0.007061513264973959, "learning_rate": 0.0001, "loss": 3.9982, "loss/crossentropy": 2.1271342635154724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993299275636673, "step": 15478 }, { "epoch": 0.3096, "grad_norm": 1.921875, "grad_norm_var": 0.007502237955729167, "learning_rate": 0.0001, "loss": 4.0211, "loss/crossentropy": 2.132619023323059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20428313314914703, "step": 15480 }, { "epoch": 0.30964, "grad_norm": 2.296875, "grad_norm_var": 0.013720703125, "learning_rate": 0.0001, "loss": 4.2644, "loss/crossentropy": 1.8355163931846619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969365030527115, "step": 15482 }, { "epoch": 0.30968, "grad_norm": 2.21875, "grad_norm_var": 0.016695149739583335, "learning_rate": 0.0001, "loss": 4.0001, "loss/crossentropy": 1.6765839457511902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19648748636245728, "step": 15484 }, { "epoch": 0.30972, "grad_norm": 2.03125, "grad_norm_var": 0.0168609619140625, "learning_rate": 0.0001, "loss": 4.0806, "loss/crossentropy": 1.7628436088562012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1850176900625229, "step": 15486 }, { "epoch": 0.30976, "grad_norm": 2.125, "grad_norm_var": 0.017014312744140624, "learning_rate": 0.0001, "loss": 4.5902, "loss/crossentropy": 2.407894253730774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2243446335196495, "step": 15488 }, { "epoch": 0.3098, "grad_norm": 1.8984375, "grad_norm_var": 0.017409006754557293, "learning_rate": 0.0001, "loss": 4.0274, "loss/crossentropy": 2.0359743237495422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140478640794754, "step": 15490 }, { "epoch": 0.30984, "grad_norm": 2.078125, "grad_norm_var": 0.012111155192057292, "learning_rate": 0.0001, "loss": 4.086, "loss/crossentropy": 2.1705892086029053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108132392168045, "step": 15492 }, { "epoch": 0.30988, "grad_norm": 1.984375, "grad_norm_var": 0.011126454671223958, "learning_rate": 0.0001, "loss": 3.9782, "loss/crossentropy": 1.836803376674652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19436977803707123, "step": 15494 }, { "epoch": 0.30992, "grad_norm": 2.015625, "grad_norm_var": 0.010432688395182292, "learning_rate": 0.0001, "loss": 4.1018, "loss/crossentropy": 2.1445683240890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21950577944517136, "step": 15496 }, { "epoch": 0.30996, "grad_norm": 2.09375, "grad_norm_var": 0.007299550374348958, "learning_rate": 0.0001, "loss": 4.1382, "loss/crossentropy": 2.258531093597412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212137870490551, "step": 15498 }, { "epoch": 0.31, "grad_norm": 1.96875, "grad_norm_var": 0.004052480061848958, "learning_rate": 0.0001, "loss": 4.2473, "loss/crossentropy": 1.9310460686683655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18316251039505005, "step": 15500 }, { "epoch": 0.31004, "grad_norm": 1.9375, "grad_norm_var": 0.004788970947265625, "learning_rate": 0.0001, "loss": 3.8656, "loss/crossentropy": 1.867807149887085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18478462100028992, "step": 15502 }, { "epoch": 0.31008, "grad_norm": 2.21875, "grad_norm_var": 0.008042144775390624, "learning_rate": 0.0001, "loss": 4.4691, "loss/crossentropy": 2.0724986791610718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20986932516098022, "step": 15504 }, { "epoch": 0.31012, "grad_norm": 2.203125, "grad_norm_var": 0.010138956705729167, "learning_rate": 0.0001, "loss": 4.5188, "loss/crossentropy": 2.1599953174591064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2250317633152008, "step": 15506 }, { "epoch": 0.31016, "grad_norm": 1.8125, "grad_norm_var": 0.015608469645182291, "learning_rate": 0.0001, "loss": 3.7842, "loss/crossentropy": 1.975223183631897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104134038090706, "step": 15508 }, { "epoch": 0.3102, "grad_norm": 2.21875, "grad_norm_var": 0.019539388020833333, "learning_rate": 0.0001, "loss": 4.0028, "loss/crossentropy": 2.133773148059845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20153696089982986, "step": 15510 }, { "epoch": 0.31024, "grad_norm": 1.984375, "grad_norm_var": 0.02014948527018229, "learning_rate": 0.0001, "loss": 3.9948, "loss/crossentropy": 1.994078278541565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20573429763317108, "step": 15512 }, { "epoch": 0.31028, "grad_norm": 1.953125, "grad_norm_var": 0.019280751546223957, "learning_rate": 0.0001, "loss": 3.8209, "loss/crossentropy": 1.7885381579399109, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889629364013672, "step": 15514 }, { "epoch": 0.31032, "grad_norm": 2.0, "grad_norm_var": 0.019191233317057292, "learning_rate": 0.0001, "loss": 4.2765, "loss/crossentropy": 2.495113968849182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22110097110271454, "step": 15516 }, { "epoch": 0.31036, "grad_norm": 1.953125, "grad_norm_var": 0.019160715738932292, "learning_rate": 0.0001, "loss": 4.0465, "loss/crossentropy": 2.0757648944854736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20948036015033722, "step": 15518 }, { "epoch": 0.3104, "grad_norm": 1.9765625, "grad_norm_var": 0.015793609619140624, "learning_rate": 0.0001, "loss": 4.1369, "loss/crossentropy": 2.151356875896454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22598887234926224, "step": 15520 }, { "epoch": 0.31044, "grad_norm": 2.0, "grad_norm_var": 0.010933176676432291, "learning_rate": 0.0001, "loss": 4.2193, "loss/crossentropy": 2.1210484504699707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23365220427513123, "step": 15522 }, { "epoch": 0.31048, "grad_norm": 1.8984375, "grad_norm_var": 0.008414459228515626, "learning_rate": 0.0001, "loss": 4.0584, "loss/crossentropy": 2.3766754865646362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22161870449781418, "step": 15524 }, { "epoch": 0.31052, "grad_norm": 1.9375, "grad_norm_var": 0.0043853759765625, "learning_rate": 0.0001, "loss": 4.0968, "loss/crossentropy": 2.33975088596344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21598009765148163, "step": 15526 }, { "epoch": 0.31056, "grad_norm": 2.1875, "grad_norm_var": 0.006468709309895833, "learning_rate": 0.0001, "loss": 4.4643, "loss/crossentropy": 2.3024542331695557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20656803250312805, "step": 15528 }, { "epoch": 0.3106, "grad_norm": 2.078125, "grad_norm_var": 0.007666015625, "learning_rate": 0.0001, "loss": 4.1137, "loss/crossentropy": 2.009307861328125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19054396450519562, "step": 15530 }, { "epoch": 0.31064, "grad_norm": 1.90625, "grad_norm_var": 0.008902994791666667, "learning_rate": 0.0001, "loss": 3.9413, "loss/crossentropy": 1.8119140267372131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19199557602405548, "step": 15532 }, { "epoch": 0.31068, "grad_norm": 1.9140625, "grad_norm_var": 0.009614817301432292, "learning_rate": 0.0001, "loss": 3.8753, "loss/crossentropy": 1.9586234092712402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18981865793466568, "step": 15534 }, { "epoch": 0.31072, "grad_norm": 1.96875, "grad_norm_var": 0.010798899332682292, "learning_rate": 0.0001, "loss": 4.4319, "loss/crossentropy": 2.1208658814430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20590776205062866, "step": 15536 }, { "epoch": 0.31076, "grad_norm": 1.84375, "grad_norm_var": 0.012336222330729167, "learning_rate": 0.0001, "loss": 3.7806, "loss/crossentropy": 1.539306402206421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1615876704454422, "step": 15538 }, { "epoch": 0.3108, "grad_norm": 1.9921875, "grad_norm_var": 0.013768513997395834, "learning_rate": 0.0001, "loss": 4.0686, "loss/crossentropy": 2.1376689672470093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21377786993980408, "step": 15540 }, { "epoch": 0.31084, "grad_norm": 2.140625, "grad_norm_var": 0.016249338785807293, "learning_rate": 0.0001, "loss": 4.368, "loss/crossentropy": 1.954556941986084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19270780682563782, "step": 15542 }, { "epoch": 0.31088, "grad_norm": 2.015625, "grad_norm_var": 0.013575998942057292, "learning_rate": 0.0001, "loss": 4.1313, "loss/crossentropy": 2.0225048661231995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937834918498993, "step": 15544 }, { "epoch": 0.31092, "grad_norm": 2.0625, "grad_norm_var": 0.011310831705729166, "learning_rate": 0.0001, "loss": 4.0919, "loss/crossentropy": 2.149065852165222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21962979435920715, "step": 15546 }, { "epoch": 0.31096, "grad_norm": 2.171875, "grad_norm_var": 0.013073476155598958, "learning_rate": 0.0001, "loss": 4.0192, "loss/crossentropy": 1.9546958804130554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20022059231996536, "step": 15548 }, { "epoch": 0.311, "grad_norm": 1.90625, "grad_norm_var": 0.013728841145833334, "learning_rate": 0.0001, "loss": 4.2815, "loss/crossentropy": 2.3048110008239746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232206016778946, "step": 15550 }, { "epoch": 0.31104, "grad_norm": 1.953125, "grad_norm_var": 0.014525349934895833, "learning_rate": 0.0001, "loss": 3.9263, "loss/crossentropy": 2.0329134464263916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20285522937774658, "step": 15552 }, { "epoch": 0.31108, "grad_norm": 2.0, "grad_norm_var": 0.013138580322265624, "learning_rate": 0.0001, "loss": 4.1654, "loss/crossentropy": 2.0910086631774902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20308854430913925, "step": 15554 }, { "epoch": 0.31112, "grad_norm": 1.96875, "grad_norm_var": 0.012361399332682292, "learning_rate": 0.0001, "loss": 4.1922, "loss/crossentropy": 2.3473485708236694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138630449771881, "step": 15556 }, { "epoch": 0.31116, "grad_norm": 1.984375, "grad_norm_var": 0.0098297119140625, "learning_rate": 0.0001, "loss": 4.3456, "loss/crossentropy": 2.4176318645477295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23624707758426666, "step": 15558 }, { "epoch": 0.3112, "grad_norm": 1.9609375, "grad_norm_var": 0.009943644205729166, "learning_rate": 0.0001, "loss": 4.0898, "loss/crossentropy": 2.190505266189575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21235676854848862, "step": 15560 }, { "epoch": 0.31124, "grad_norm": 2.28125, "grad_norm_var": 0.1935198465983073, "learning_rate": 0.0001, "loss": 4.3688, "loss/crossentropy": 2.1774531602859497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2568993717432022, "step": 15562 }, { "epoch": 0.31128, "grad_norm": 1.9609375, "grad_norm_var": 0.1934832255045573, "learning_rate": 0.0001, "loss": 4.004, "loss/crossentropy": 2.0320950150489807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21587081998586655, "step": 15564 }, { "epoch": 0.31132, "grad_norm": 2.125, "grad_norm_var": 0.19068781534830728, "learning_rate": 0.0001, "loss": 4.3037, "loss/crossentropy": 2.0477761030197144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21909283846616745, "step": 15566 }, { "epoch": 0.31136, "grad_norm": 2.09375, "grad_norm_var": 0.18454996744791666, "learning_rate": 0.0001, "loss": 4.2288, "loss/crossentropy": 2.3907222747802734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21334309875965118, "step": 15568 }, { "epoch": 0.3114, "grad_norm": 1.984375, "grad_norm_var": 0.1816973368326823, "learning_rate": 0.0001, "loss": 4.28, "loss/crossentropy": 2.2539732456207275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21502045542001724, "step": 15570 }, { "epoch": 0.31144, "grad_norm": 1.96875, "grad_norm_var": 0.17766520182291667, "learning_rate": 0.0001, "loss": 3.9675, "loss/crossentropy": 2.036223292350769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959664672613144, "step": 15572 }, { "epoch": 0.31148, "grad_norm": 1.984375, "grad_norm_var": 0.17722880045572917, "learning_rate": 0.0001, "loss": 4.0387, "loss/crossentropy": 1.7659193873405457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19535844773054123, "step": 15574 }, { "epoch": 0.31152, "grad_norm": 2.015625, "grad_norm_var": 0.17946751912434897, "learning_rate": 0.0001, "loss": 3.8892, "loss/crossentropy": 2.0053369402885437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24516403675079346, "step": 15576 }, { "epoch": 0.31156, "grad_norm": 2.171875, "grad_norm_var": 0.0060384114583333336, "learning_rate": 0.0001, "loss": 3.9851, "loss/crossentropy": 2.1804131269454956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349890023469925, "step": 15578 }, { "epoch": 0.3116, "grad_norm": 1.9296875, "grad_norm_var": 0.008552042643229167, "learning_rate": 0.0001, "loss": 3.9369, "loss/crossentropy": 2.314085602760315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21073052287101746, "step": 15580 }, { "epoch": 0.31164, "grad_norm": 1.8203125, "grad_norm_var": 0.008973948160807292, "learning_rate": 0.0001, "loss": 3.7951, "loss/crossentropy": 2.1868577003479004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21520060300827026, "step": 15582 }, { "epoch": 0.31168, "grad_norm": 1.953125, "grad_norm_var": 0.007957967122395833, "learning_rate": 0.0001, "loss": 4.1618, "loss/crossentropy": 1.9785407185554504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18188606202602386, "step": 15584 }, { "epoch": 0.31172, "grad_norm": 2.109375, "grad_norm_var": 0.009598795572916667, "learning_rate": 0.0001, "loss": 3.9173, "loss/crossentropy": 1.9320110082626343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022753208875656, "step": 15586 }, { "epoch": 0.31176, "grad_norm": 2.28125, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 4.0995, "loss/crossentropy": 1.7925443649291992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17203447967767715, "step": 15588 }, { "epoch": 0.3118, "grad_norm": 1.9609375, "grad_norm_var": 0.016556803385416666, "learning_rate": 0.0001, "loss": 3.8876, "loss/crossentropy": 2.048543393611908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18696243315935135, "step": 15590 }, { "epoch": 0.31184, "grad_norm": 1.921875, "grad_norm_var": 0.01658935546875, "learning_rate": 0.0001, "loss": 4.125, "loss/crossentropy": 2.376060724258423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20351988822221756, "step": 15592 }, { "epoch": 0.31188, "grad_norm": 2.171875, "grad_norm_var": 0.0164947509765625, "learning_rate": 0.0001, "loss": 4.1778, "loss/crossentropy": 1.5962707996368408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1538907140493393, "step": 15594 }, { "epoch": 0.31192, "grad_norm": 1.9453125, "grad_norm_var": 0.016281890869140624, "learning_rate": 0.0001, "loss": 3.7529, "loss/crossentropy": 1.7845428586006165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18107154965400696, "step": 15596 }, { "epoch": 0.31196, "grad_norm": 2.109375, "grad_norm_var": 0.01536865234375, "learning_rate": 0.0001, "loss": 4.0385, "loss/crossentropy": 1.8557100296020508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010527327656746, "step": 15598 }, { "epoch": 0.312, "grad_norm": 1.921875, "grad_norm_var": 0.015372721354166667, "learning_rate": 0.0001, "loss": 3.9735, "loss/crossentropy": 1.8659257888793945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19198190420866013, "step": 15600 }, { "epoch": 0.31204, "grad_norm": 2.046875, "grad_norm_var": 0.014078521728515625, "learning_rate": 0.0001, "loss": 3.9842, "loss/crossentropy": 2.0340747833251953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19963373243808746, "step": 15602 }, { "epoch": 0.31208, "grad_norm": 2.09375, "grad_norm_var": 0.00931396484375, "learning_rate": 0.0001, "loss": 4.1192, "loss/crossentropy": 2.0676557421684265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20130064338445663, "step": 15604 }, { "epoch": 0.31212, "grad_norm": 1.84375, "grad_norm_var": 0.008397420247395834, "learning_rate": 0.0001, "loss": 3.8069, "loss/crossentropy": 2.038145124912262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034953236579895, "step": 15606 }, { "epoch": 0.31216, "grad_norm": 2.65625, "grad_norm_var": 0.03449605305989583, "learning_rate": 0.0001, "loss": 4.3615, "loss/crossentropy": 2.3371706008911133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21727421879768372, "step": 15608 }, { "epoch": 0.3122, "grad_norm": 2.03125, "grad_norm_var": 0.03327611287434896, "learning_rate": 0.0001, "loss": 4.1988, "loss/crossentropy": 2.3314318656921387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064831778407097, "step": 15610 }, { "epoch": 0.31224, "grad_norm": 1.9375, "grad_norm_var": 0.03206558227539062, "learning_rate": 0.0001, "loss": 3.9634, "loss/crossentropy": 1.8074566721916199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1755329668521881, "step": 15612 }, { "epoch": 0.31228, "grad_norm": 1.96875, "grad_norm_var": 0.032138824462890625, "learning_rate": 0.0001, "loss": 4.1849, "loss/crossentropy": 2.1219228506088257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23183195292949677, "step": 15614 }, { "epoch": 0.31232, "grad_norm": 2.03125, "grad_norm_var": 0.031434885660807294, "learning_rate": 0.0001, "loss": 4.1418, "loss/crossentropy": 2.3573983907699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20713336020708084, "step": 15616 }, { "epoch": 0.31236, "grad_norm": 1.859375, "grad_norm_var": 0.033933258056640624, "learning_rate": 0.0001, "loss": 4.048, "loss/crossentropy": 2.0748316049575806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19384868443012238, "step": 15618 }, { "epoch": 0.3124, "grad_norm": 2.03125, "grad_norm_var": 0.033882395426432295, "learning_rate": 0.0001, "loss": 4.1834, "loss/crossentropy": 1.7933568358421326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19711802154779434, "step": 15620 }, { "epoch": 0.31244, "grad_norm": 2.046875, "grad_norm_var": 0.03115819295247396, "learning_rate": 0.0001, "loss": 4.2023, "loss/crossentropy": 1.8922778367996216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19847699999809265, "step": 15622 }, { "epoch": 0.31248, "grad_norm": 1.9765625, "grad_norm_var": 0.0048004150390625, "learning_rate": 0.0001, "loss": 4.2326, "loss/crossentropy": 2.3432679176330566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22804179042577744, "step": 15624 }, { "epoch": 0.31252, "grad_norm": 2.203125, "grad_norm_var": 0.007469685872395834, "learning_rate": 0.0001, "loss": 4.1215, "loss/crossentropy": 2.340356707572937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22454006224870682, "step": 15626 }, { "epoch": 0.31256, "grad_norm": 1.8046875, "grad_norm_var": 0.010737864176432292, "learning_rate": 0.0001, "loss": 4.0874, "loss/crossentropy": 2.037777841091156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20374249666929245, "step": 15628 }, { "epoch": 0.3126, "grad_norm": 2.078125, "grad_norm_var": 0.010562896728515625, "learning_rate": 0.0001, "loss": 4.3193, "loss/crossentropy": 2.2576274275779724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19910124689340591, "step": 15630 }, { "epoch": 0.31264, "grad_norm": 1.8828125, "grad_norm_var": 0.011302693684895834, "learning_rate": 0.0001, "loss": 4.1561, "loss/crossentropy": 2.1354891061782837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20646820217370987, "step": 15632 }, { "epoch": 0.31268, "grad_norm": 2.046875, "grad_norm_var": 0.011136627197265625, "learning_rate": 0.0001, "loss": 3.906, "loss/crossentropy": 1.8888733386993408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20730097591876984, "step": 15634 }, { "epoch": 0.31272, "grad_norm": 1.921875, "grad_norm_var": 0.012474568684895833, "learning_rate": 0.0001, "loss": 3.9522, "loss/crossentropy": 2.1037773489952087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20308485627174377, "step": 15636 }, { "epoch": 0.31276, "grad_norm": 2.21875, "grad_norm_var": 0.017085774739583334, "learning_rate": 0.0001, "loss": 4.244, "loss/crossentropy": 1.996969223022461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2080031782388687, "step": 15638 }, { "epoch": 0.3128, "grad_norm": 2.671875, "grad_norm_var": 0.0469146728515625, "learning_rate": 0.0001, "loss": 3.915, "loss/crossentropy": 2.0528018474578857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2248053327202797, "step": 15640 }, { "epoch": 0.31284, "grad_norm": 1.9765625, "grad_norm_var": 0.044864654541015625, "learning_rate": 0.0001, "loss": 4.1718, "loss/crossentropy": 2.270516335964203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19578035175800323, "step": 15642 }, { "epoch": 0.31288, "grad_norm": 2.046875, "grad_norm_var": 0.0400299072265625, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 2.146915912628174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21282948553562164, "step": 15644 }, { "epoch": 0.31292, "grad_norm": 2.125, "grad_norm_var": 0.041562652587890624, "learning_rate": 0.0001, "loss": 4.3275, "loss/crossentropy": 2.058235287666321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20604285597801208, "step": 15646 }, { "epoch": 0.31296, "grad_norm": 2.015625, "grad_norm_var": 0.0414215087890625, "learning_rate": 0.0001, "loss": 4.4304, "loss/crossentropy": 2.3217705488204956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24202678352594376, "step": 15648 }, { "epoch": 0.313, "grad_norm": 2.0625, "grad_norm_var": 0.041257476806640624, "learning_rate": 0.0001, "loss": 4.1372, "loss/crossentropy": 2.108501672744751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20358111709356308, "step": 15650 }, { "epoch": 0.31304, "grad_norm": 1.859375, "grad_norm_var": 0.041025543212890626, "learning_rate": 0.0001, "loss": 4.1224, "loss/crossentropy": 2.0875802636146545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20186328142881393, "step": 15652 }, { "epoch": 0.31308, "grad_norm": 2.0625, "grad_norm_var": 0.036321767171223956, "learning_rate": 0.0001, "loss": 4.1994, "loss/crossentropy": 2.0159415006637573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053827941417694, "step": 15654 }, { "epoch": 0.31312, "grad_norm": 2.015625, "grad_norm_var": 0.008402252197265625, "learning_rate": 0.0001, "loss": 3.9711, "loss/crossentropy": 1.8664127588272095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18666206300258636, "step": 15656 }, { "epoch": 0.31316, "grad_norm": 1.9609375, "grad_norm_var": 0.008548736572265625, "learning_rate": 0.0001, "loss": 3.9399, "loss/crossentropy": 1.9351357221603394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189142145216465, "step": 15658 }, { "epoch": 0.3132, "grad_norm": 1.96875, "grad_norm_var": 0.010872141520182291, "learning_rate": 0.0001, "loss": 4.3288, "loss/crossentropy": 1.9937176704406738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18212904781103134, "step": 15660 }, { "epoch": 0.31324, "grad_norm": 2.03125, "grad_norm_var": 0.009801991780598958, "learning_rate": 0.0001, "loss": 4.1238, "loss/crossentropy": 2.0626447796821594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20908795297145844, "step": 15662 }, { "epoch": 0.31328, "grad_norm": 1.9375, "grad_norm_var": 0.008506011962890626, "learning_rate": 0.0001, "loss": 4.0436, "loss/crossentropy": 1.8491488695144653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18241358548402786, "step": 15664 }, { "epoch": 0.31332, "grad_norm": 1.8828125, "grad_norm_var": 0.0084625244140625, "learning_rate": 0.0001, "loss": 3.9885, "loss/crossentropy": 2.2744827270507812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295297086238861, "step": 15666 }, { "epoch": 0.31336, "grad_norm": 1.8984375, "grad_norm_var": 0.0078521728515625, "learning_rate": 0.0001, "loss": 3.9213, "loss/crossentropy": 2.30733585357666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23149366676807404, "step": 15668 }, { "epoch": 0.3134, "grad_norm": 2.140625, "grad_norm_var": 0.00892333984375, "learning_rate": 0.0001, "loss": 4.1357, "loss/crossentropy": 2.038204550743103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19608831405639648, "step": 15670 }, { "epoch": 0.31344, "grad_norm": 1.953125, "grad_norm_var": 0.008676910400390625, "learning_rate": 0.0001, "loss": 4.1235, "loss/crossentropy": 2.076035261154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21801196038722992, "step": 15672 }, { "epoch": 0.31348, "grad_norm": 1.7734375, "grad_norm_var": 0.011310831705729166, "learning_rate": 0.0001, "loss": 4.0923, "loss/crossentropy": 1.9234278202056885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18380460888147354, "step": 15674 }, { "epoch": 0.31352, "grad_norm": 2.125, "grad_norm_var": 0.00931396484375, "learning_rate": 0.0001, "loss": 4.1002, "loss/crossentropy": 1.9312421679496765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18474044650793076, "step": 15676 }, { "epoch": 0.31356, "grad_norm": 2.0625, "grad_norm_var": 0.009716542561848958, "learning_rate": 0.0001, "loss": 4.079, "loss/crossentropy": 2.014132261276245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045312151312828, "step": 15678 }, { "epoch": 0.3136, "grad_norm": 1.9765625, "grad_norm_var": 0.0099365234375, "learning_rate": 0.0001, "loss": 4.2325, "loss/crossentropy": 1.925381362438202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18306680023670197, "step": 15680 }, { "epoch": 0.31364, "grad_norm": 1.8828125, "grad_norm_var": 0.009346516927083333, "learning_rate": 0.0001, "loss": 4.0766, "loss/crossentropy": 1.932526171207428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20741897821426392, "step": 15682 }, { "epoch": 0.31368, "grad_norm": 2.015625, "grad_norm_var": 0.008963775634765626, "learning_rate": 0.0001, "loss": 4.2234, "loss/crossentropy": 2.1144750714302063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19398797303438187, "step": 15684 }, { "epoch": 0.31372, "grad_norm": 1.9296875, "grad_norm_var": 0.0076983133951822914, "learning_rate": 0.0001, "loss": 3.8411, "loss/crossentropy": 1.8115296363830566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869400143623352, "step": 15686 }, { "epoch": 0.31376, "grad_norm": 2.296875, "grad_norm_var": 0.014686838785807291, "learning_rate": 0.0001, "loss": 4.0935, "loss/crossentropy": 2.0550562739372253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093418911099434, "step": 15688 }, { "epoch": 0.3138, "grad_norm": 1.8046875, "grad_norm_var": 0.013541412353515626, "learning_rate": 0.0001, "loss": 3.8549, "loss/crossentropy": 2.144737482070923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20025025308132172, "step": 15690 }, { "epoch": 0.31384, "grad_norm": 1.8984375, "grad_norm_var": 0.012629191080729166, "learning_rate": 0.0001, "loss": 3.9911, "loss/crossentropy": 2.2495052814483643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21178698539733887, "step": 15692 }, { "epoch": 0.31388, "grad_norm": 2.03125, "grad_norm_var": 0.012740071614583333, "learning_rate": 0.0001, "loss": 4.2208, "loss/crossentropy": 2.1858155727386475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20891498774290085, "step": 15694 }, { "epoch": 0.31392, "grad_norm": 1.9609375, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 4.1841, "loss/crossentropy": 2.1480127573013306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149479240179062, "step": 15696 }, { "epoch": 0.31396, "grad_norm": 1.9921875, "grad_norm_var": 0.013329060872395833, "learning_rate": 0.0001, "loss": 3.9218, "loss/crossentropy": 2.1585733294487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20317788422107697, "step": 15698 }, { "epoch": 0.314, "grad_norm": 1.9609375, "grad_norm_var": 0.2103179931640625, "learning_rate": 0.0001, "loss": 4.06, "loss/crossentropy": 2.1281010508537292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2032032608985901, "step": 15700 }, { "epoch": 0.31404, "grad_norm": 1.9375, "grad_norm_var": 0.2096588134765625, "learning_rate": 0.0001, "loss": 4.0189, "loss/crossentropy": 2.160776972770691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20999448001384735, "step": 15702 }, { "epoch": 0.31408, "grad_norm": 1.9140625, "grad_norm_var": 0.20640869140625, "learning_rate": 0.0001, "loss": 3.961, "loss/crossentropy": 2.1508986949920654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21675271540880203, "step": 15704 }, { "epoch": 0.31412, "grad_norm": 2.109375, "grad_norm_var": 0.2011871337890625, "learning_rate": 0.0001, "loss": 4.1573, "loss/crossentropy": 2.083084225654602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21110595017671585, "step": 15706 }, { "epoch": 0.31416, "grad_norm": 1.8046875, "grad_norm_var": 0.20275472005208334, "learning_rate": 0.0001, "loss": 3.9785, "loss/crossentropy": 2.1268805265426636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728368312120438, "step": 15708 }, { "epoch": 0.3142, "grad_norm": 2.078125, "grad_norm_var": 0.20265706380208334, "learning_rate": 0.0001, "loss": 4.4291, "loss/crossentropy": 2.156775116920471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22175253927707672, "step": 15710 }, { "epoch": 0.31424, "grad_norm": 1.984375, "grad_norm_var": 0.20224583943684896, "learning_rate": 0.0001, "loss": 4.1537, "loss/crossentropy": 2.2541415691375732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22773633152246475, "step": 15712 }, { "epoch": 0.31428, "grad_norm": 2.0625, "grad_norm_var": 0.20350723266601561, "learning_rate": 0.0001, "loss": 3.9166, "loss/crossentropy": 1.6463716626167297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18172992020845413, "step": 15714 }, { "epoch": 0.31432, "grad_norm": 2.03125, "grad_norm_var": 0.011669921875, "learning_rate": 0.0001, "loss": 4.0144, "loss/crossentropy": 1.9325580596923828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19147639721632004, "step": 15716 }, { "epoch": 0.31436, "grad_norm": 2.109375, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 3.9952, "loss/crossentropy": 2.4016847610473633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058410421013832, "step": 15718 }, { "epoch": 0.3144, "grad_norm": 2.125, "grad_norm_var": 0.014338175455729166, "learning_rate": 0.0001, "loss": 4.1847, "loss/crossentropy": 2.3456791639328003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22646376490592957, "step": 15720 }, { "epoch": 0.31444, "grad_norm": 2.046875, "grad_norm_var": 0.013785807291666667, "learning_rate": 0.0001, "loss": 4.0551, "loss/crossentropy": 1.6703909635543823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1818019300699234, "step": 15722 }, { "epoch": 0.31448, "grad_norm": 1.953125, "grad_norm_var": 0.011065419514973958, "learning_rate": 0.0001, "loss": 4.0561, "loss/crossentropy": 2.0004162192344666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033773437142372, "step": 15724 }, { "epoch": 0.31452, "grad_norm": 2.0, "grad_norm_var": 0.009905751546223958, "learning_rate": 0.0001, "loss": 4.1799, "loss/crossentropy": 2.018653154373169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20320426672697067, "step": 15726 }, { "epoch": 0.31456, "grad_norm": 2.046875, "grad_norm_var": 0.009592437744140625, "learning_rate": 0.0001, "loss": 4.0991, "loss/crossentropy": 2.0881033539772034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126169577240944, "step": 15728 }, { "epoch": 0.3146, "grad_norm": 2.046875, "grad_norm_var": 0.0067942301432291664, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 2.1103954911231995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163369059562683, "step": 15730 }, { "epoch": 0.31464, "grad_norm": 2.015625, "grad_norm_var": 0.007868448893229166, "learning_rate": 0.0001, "loss": 4.142, "loss/crossentropy": 1.9757474660873413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21398526430130005, "step": 15732 }, { "epoch": 0.31468, "grad_norm": 1.921875, "grad_norm_var": 0.006304677327473958, "learning_rate": 0.0001, "loss": 3.8965, "loss/crossentropy": 2.114749312400818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21620750427246094, "step": 15734 }, { "epoch": 0.31472, "grad_norm": 1.8828125, "grad_norm_var": 0.007163238525390625, "learning_rate": 0.0001, "loss": 3.9688, "loss/crossentropy": 1.9022989869117737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1895657405257225, "step": 15736 }, { "epoch": 0.31476, "grad_norm": 2.296875, "grad_norm_var": 0.013059234619140625, "learning_rate": 0.0001, "loss": 4.2254, "loss/crossentropy": 2.0457635521888733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21281428635120392, "step": 15738 }, { "epoch": 0.3148, "grad_norm": 2.015625, "grad_norm_var": 0.011954752604166667, "learning_rate": 0.0001, "loss": 3.8985, "loss/crossentropy": 1.9585834741592407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003040909767151, "step": 15740 }, { "epoch": 0.31484, "grad_norm": 2.046875, "grad_norm_var": 0.014745076497395834, "learning_rate": 0.0001, "loss": 4.1682, "loss/crossentropy": 2.06454074382782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22020584344863892, "step": 15742 }, { "epoch": 0.31488, "grad_norm": 1.921875, "grad_norm_var": 0.015665690104166668, "learning_rate": 0.0001, "loss": 4.2384, "loss/crossentropy": 2.2651820182800293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22998705506324768, "step": 15744 }, { "epoch": 0.31492, "grad_norm": 1.9375, "grad_norm_var": 0.0172607421875, "learning_rate": 0.0001, "loss": 4.258, "loss/crossentropy": 2.269286036491394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21012098342180252, "step": 15746 }, { "epoch": 0.31496, "grad_norm": 1.953125, "grad_norm_var": 0.017891438802083333, "learning_rate": 0.0001, "loss": 4.0604, "loss/crossentropy": 1.6778026223182678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17330823838710785, "step": 15748 }, { "epoch": 0.315, "grad_norm": 1.78125, "grad_norm_var": 0.020775349934895833, "learning_rate": 0.0001, "loss": 3.9785, "loss/crossentropy": 1.8929405808448792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18170948326587677, "step": 15750 }, { "epoch": 0.31504, "grad_norm": 1.953125, "grad_norm_var": 0.01980768839518229, "learning_rate": 0.0001, "loss": 4.0874, "loss/crossentropy": 1.8814340233802795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2097821682691574, "step": 15752 }, { "epoch": 0.31508, "grad_norm": 2.09375, "grad_norm_var": 0.014465077718098959, "learning_rate": 0.0001, "loss": 3.8622, "loss/crossentropy": 1.7101264595985413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1834770292043686, "step": 15754 }, { "epoch": 0.31512, "grad_norm": 1.828125, "grad_norm_var": 0.01607233683268229, "learning_rate": 0.0001, "loss": 3.8498, "loss/crossentropy": 2.0287395119667053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19603880494832993, "step": 15756 }, { "epoch": 0.31516, "grad_norm": 1.875, "grad_norm_var": 0.010467274983723959, "learning_rate": 0.0001, "loss": 3.7485, "loss/crossentropy": 2.0119568705558777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212313212454319, "step": 15758 }, { "epoch": 0.3152, "grad_norm": 2.015625, "grad_norm_var": 0.011441802978515625, "learning_rate": 0.0001, "loss": 4.5141, "loss/crossentropy": 1.8937278985977173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1774902269244194, "step": 15760 }, { "epoch": 0.31524, "grad_norm": 2.015625, "grad_norm_var": 0.009104156494140625, "learning_rate": 0.0001, "loss": 4.3562, "loss/crossentropy": 2.557780146598816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22111114859580994, "step": 15762 }, { "epoch": 0.31528, "grad_norm": 2.1875, "grad_norm_var": 0.011262003580729167, "learning_rate": 0.0001, "loss": 4.1646, "loss/crossentropy": 2.1467400789260864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134222611784935, "step": 15764 }, { "epoch": 0.31532, "grad_norm": 2.03125, "grad_norm_var": 0.00814208984375, "learning_rate": 0.0001, "loss": 3.9631, "loss/crossentropy": 1.9758468270301819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1853625252842903, "step": 15766 }, { "epoch": 0.31536, "grad_norm": 2.171875, "grad_norm_var": 0.009699503580729166, "learning_rate": 0.0001, "loss": 4.2144, "loss/crossentropy": 1.8620794415473938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18791437149047852, "step": 15768 }, { "epoch": 0.3154, "grad_norm": 2.03125, "grad_norm_var": 0.008893839518229167, "learning_rate": 0.0001, "loss": 4.2066, "loss/crossentropy": 2.187020480632782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102060467004776, "step": 15770 }, { "epoch": 0.31544, "grad_norm": 2.03125, "grad_norm_var": 0.006522369384765625, "learning_rate": 0.0001, "loss": 4.1828, "loss/crossentropy": 2.0049954652786255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19090547412633896, "step": 15772 }, { "epoch": 0.31548, "grad_norm": 2.03125, "grad_norm_var": 0.010357411702473958, "learning_rate": 0.0001, "loss": 4.195, "loss/crossentropy": 2.0030736327171326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20134840160608292, "step": 15774 }, { "epoch": 0.31552, "grad_norm": 2.0, "grad_norm_var": 0.010687001546223958, "learning_rate": 0.0001, "loss": 4.3092, "loss/crossentropy": 1.8941256999969482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20826192945241928, "step": 15776 }, { "epoch": 0.31556, "grad_norm": 1.8046875, "grad_norm_var": 0.015596516927083333, "learning_rate": 0.0001, "loss": 4.0767, "loss/crossentropy": 2.0645129680633545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19137012213468552, "step": 15778 }, { "epoch": 0.3156, "grad_norm": 1.9609375, "grad_norm_var": 0.014240519205729166, "learning_rate": 0.0001, "loss": 4.0349, "loss/crossentropy": 2.198352813720703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21606199443340302, "step": 15780 }, { "epoch": 0.31564, "grad_norm": 2.125, "grad_norm_var": 0.014020792643229167, "learning_rate": 0.0001, "loss": 4.1544, "loss/crossentropy": 1.9041627049446106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19438787549734116, "step": 15782 }, { "epoch": 0.31568, "grad_norm": 2.078125, "grad_norm_var": 0.027741495768229166, "learning_rate": 0.0001, "loss": 4.2615, "loss/crossentropy": 2.110167443752289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026640698313713, "step": 15784 }, { "epoch": 0.31572, "grad_norm": 2.171875, "grad_norm_var": 0.02655029296875, "learning_rate": 0.0001, "loss": 4.1932, "loss/crossentropy": 2.5422832369804382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274986326694489, "step": 15786 }, { "epoch": 0.31576, "grad_norm": 1.921875, "grad_norm_var": 0.02840576171875, "learning_rate": 0.0001, "loss": 4.2038, "loss/crossentropy": 2.106445074081421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20275159180164337, "step": 15788 }, { "epoch": 0.3158, "grad_norm": 2.375, "grad_norm_var": 0.029857381184895834, "learning_rate": 0.0001, "loss": 4.2718, "loss/crossentropy": 2.341952681541443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22970512509346008, "step": 15790 }, { "epoch": 0.31584, "grad_norm": 1.96875, "grad_norm_var": 0.030228678385416666, "learning_rate": 0.0001, "loss": 4.238, "loss/crossentropy": 2.120057761669159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21171308308839798, "step": 15792 }, { "epoch": 0.31588, "grad_norm": 1.8671875, "grad_norm_var": 0.027644856770833334, "learning_rate": 0.0001, "loss": 3.8819, "loss/crossentropy": 1.5601414442062378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20278310775756836, "step": 15794 }, { "epoch": 0.31592, "grad_norm": 1.984375, "grad_norm_var": 0.027596028645833333, "learning_rate": 0.0001, "loss": 4.0743, "loss/crossentropy": 2.0634138584136963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1815958246588707, "step": 15796 }, { "epoch": 0.31596, "grad_norm": 1.953125, "grad_norm_var": 0.0312652587890625, "learning_rate": 0.0001, "loss": 3.8183, "loss/crossentropy": 1.8437300324440002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18099966645240784, "step": 15798 }, { "epoch": 0.316, "grad_norm": 1.984375, "grad_norm_var": 0.015897369384765624, "learning_rate": 0.0001, "loss": 4.1102, "loss/crossentropy": 1.9323074221611023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1884680539369583, "step": 15800 }, { "epoch": 0.31604, "grad_norm": 2.265625, "grad_norm_var": 0.018027496337890626, "learning_rate": 0.0001, "loss": 4.3145, "loss/crossentropy": 2.158499598503113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23394601047039032, "step": 15802 }, { "epoch": 0.31608, "grad_norm": 1.9921875, "grad_norm_var": 0.018062337239583334, "learning_rate": 0.0001, "loss": 4.0894, "loss/crossentropy": 1.9635908007621765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20692527294158936, "step": 15804 }, { "epoch": 0.31612, "grad_norm": 1.8671875, "grad_norm_var": 0.010503896077473958, "learning_rate": 0.0001, "loss": 4.1912, "loss/crossentropy": 1.96099454164505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1971232146024704, "step": 15806 }, { "epoch": 0.31616, "grad_norm": 2.09375, "grad_norm_var": 0.010643513997395833, "learning_rate": 0.0001, "loss": 4.2205, "loss/crossentropy": 1.9690999388694763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19965435564517975, "step": 15808 }, { "epoch": 0.3162, "grad_norm": 2.015625, "grad_norm_var": 0.011354319254557292, "learning_rate": 0.0001, "loss": 4.113, "loss/crossentropy": 2.3924723863601685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192596197128296, "step": 15810 }, { "epoch": 0.31624, "grad_norm": 1.7890625, "grad_norm_var": 0.013999176025390626, "learning_rate": 0.0001, "loss": 3.8794, "loss/crossentropy": 2.137382209300995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20949169248342514, "step": 15812 }, { "epoch": 0.31628, "grad_norm": 2.0625, "grad_norm_var": 0.013103993733723958, "learning_rate": 0.0001, "loss": 4.0187, "loss/crossentropy": 1.986245334148407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19678276032209396, "step": 15814 }, { "epoch": 0.31632, "grad_norm": 1.8984375, "grad_norm_var": 0.013492584228515625, "learning_rate": 0.0001, "loss": 3.9623, "loss/crossentropy": 1.9210307598114014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18748773634433746, "step": 15816 }, { "epoch": 0.31636, "grad_norm": 2.0625, "grad_norm_var": 0.0087066650390625, "learning_rate": 0.0001, "loss": 4.1328, "loss/crossentropy": 1.9330076575279236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21187862008810043, "step": 15818 }, { "epoch": 0.3164, "grad_norm": 2.015625, "grad_norm_var": 0.007306925455729167, "learning_rate": 0.0001, "loss": 4.2475, "loss/crossentropy": 2.275822639465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2303369864821434, "step": 15820 }, { "epoch": 0.31644, "grad_norm": 1.9453125, "grad_norm_var": 0.0082275390625, "learning_rate": 0.0001, "loss": 4.3641, "loss/crossentropy": 1.9317327737808228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19543734192848206, "step": 15822 }, { "epoch": 0.31648, "grad_norm": 2.109375, "grad_norm_var": 0.008715565999348958, "learning_rate": 0.0001, "loss": 4.3209, "loss/crossentropy": 2.2146100997924805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22172697633504868, "step": 15824 }, { "epoch": 0.31652, "grad_norm": 1.9453125, "grad_norm_var": 0.007173411051432292, "learning_rate": 0.0001, "loss": 4.0226, "loss/crossentropy": 2.296873092651367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22240595519542694, "step": 15826 }, { "epoch": 0.31656, "grad_norm": 1.953125, "grad_norm_var": 0.005900065104166667, "learning_rate": 0.0001, "loss": 4.2914, "loss/crossentropy": 2.130843997001648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21288827806711197, "step": 15828 }, { "epoch": 0.3166, "grad_norm": 1.9609375, "grad_norm_var": 0.017856597900390625, "learning_rate": 0.0001, "loss": 4.1067, "loss/crossentropy": 2.042823553085327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972874104976654, "step": 15830 }, { "epoch": 0.31664, "grad_norm": 1.9453125, "grad_norm_var": 0.01701838175455729, "learning_rate": 0.0001, "loss": 3.8878, "loss/crossentropy": 1.8040228486061096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19943677634000778, "step": 15832 }, { "epoch": 0.31668, "grad_norm": 2.1875, "grad_norm_var": 0.018400065104166665, "learning_rate": 0.0001, "loss": 4.2647, "loss/crossentropy": 2.085720181465149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2592329904437065, "step": 15834 }, { "epoch": 0.31672, "grad_norm": 2.046875, "grad_norm_var": 0.022334798177083334, "learning_rate": 0.0001, "loss": 3.7694, "loss/crossentropy": 1.9271164536476135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1766722872853279, "step": 15836 }, { "epoch": 0.31676, "grad_norm": 2.09375, "grad_norm_var": 0.022106679280598958, "learning_rate": 0.0001, "loss": 4.0742, "loss/crossentropy": 2.188960611820221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19409415870904922, "step": 15838 }, { "epoch": 0.3168, "grad_norm": 2.0, "grad_norm_var": 0.020949045817057293, "learning_rate": 0.0001, "loss": 3.7821, "loss/crossentropy": 1.5765752792358398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16517025232315063, "step": 15840 }, { "epoch": 0.31684, "grad_norm": 2.203125, "grad_norm_var": 0.020804595947265626, "learning_rate": 0.0001, "loss": 4.3356, "loss/crossentropy": 1.9913761019706726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20838972181081772, "step": 15842 }, { "epoch": 0.31688, "grad_norm": 1.984375, "grad_norm_var": 0.034366607666015625, "learning_rate": 0.0001, "loss": 4.3525, "loss/crossentropy": 1.8618363738059998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19304241985082626, "step": 15844 }, { "epoch": 0.31692, "grad_norm": 1.9375, "grad_norm_var": 0.026911417643229168, "learning_rate": 0.0001, "loss": 3.8087, "loss/crossentropy": 1.9099775552749634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2251715511083603, "step": 15846 }, { "epoch": 0.31696, "grad_norm": 1.953125, "grad_norm_var": 0.026486968994140624, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 2.3534491062164307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19920790195465088, "step": 15848 }, { "epoch": 0.317, "grad_norm": 2.203125, "grad_norm_var": 0.02804743448893229, "learning_rate": 0.0001, "loss": 4.1676, "loss/crossentropy": 2.1890534162521362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140251249074936, "step": 15850 }, { "epoch": 0.31704, "grad_norm": 1.8671875, "grad_norm_var": 0.0298980712890625, "learning_rate": 0.0001, "loss": 3.6303, "loss/crossentropy": 1.880380094051361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18094944953918457, "step": 15852 }, { "epoch": 0.31708, "grad_norm": 2.4375, "grad_norm_var": 0.03880106608072917, "learning_rate": 0.0001, "loss": 4.3276, "loss/crossentropy": 1.8723257184028625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101321741938591, "step": 15854 }, { "epoch": 0.31712, "grad_norm": 1.953125, "grad_norm_var": 0.039582316080729166, "learning_rate": 0.0001, "loss": 4.077, "loss/crossentropy": 2.0442943572998047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947987675666809, "step": 15856 }, { "epoch": 0.31716, "grad_norm": 2.0625, "grad_norm_var": 0.03819071451822917, "learning_rate": 0.0001, "loss": 4.2969, "loss/crossentropy": 2.0415892601013184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022911012172699, "step": 15858 }, { "epoch": 0.3172, "grad_norm": 1.875, "grad_norm_var": 0.02176513671875, "learning_rate": 0.0001, "loss": 3.8801, "loss/crossentropy": 1.8290876150131226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914537101984024, "step": 15860 }, { "epoch": 0.31724, "grad_norm": 1.859375, "grad_norm_var": 0.022904459635416666, "learning_rate": 0.0001, "loss": 3.8257, "loss/crossentropy": 1.6169148087501526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117067337036133, "step": 15862 }, { "epoch": 0.31728, "grad_norm": 2.0, "grad_norm_var": 0.022648111979166666, "learning_rate": 0.0001, "loss": 4.2553, "loss/crossentropy": 2.3701289892196655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557762861251831, "step": 15864 }, { "epoch": 0.31732, "grad_norm": 2.03125, "grad_norm_var": 0.019657135009765625, "learning_rate": 0.0001, "loss": 4.2299, "loss/crossentropy": 2.2519643306732178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21390919387340546, "step": 15866 }, { "epoch": 0.31736, "grad_norm": 2.015625, "grad_norm_var": 0.015409088134765625, "learning_rate": 0.0001, "loss": 4.266, "loss/crossentropy": 2.3287068009376526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22211260348558426, "step": 15868 }, { "epoch": 0.3174, "grad_norm": 1.8515625, "grad_norm_var": 0.004401652018229166, "learning_rate": 0.0001, "loss": 3.9622, "loss/crossentropy": 2.144728899002075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148151397705078, "step": 15870 }, { "epoch": 0.31744, "grad_norm": 1.8984375, "grad_norm_var": 0.004847971598307291, "learning_rate": 0.0001, "loss": 3.9042, "loss/crossentropy": 2.120876669883728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21692734956741333, "step": 15872 }, { "epoch": 0.31748, "grad_norm": 1.8828125, "grad_norm_var": 0.004703521728515625, "learning_rate": 0.0001, "loss": 3.9254, "loss/crossentropy": 2.2827231884002686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20714152604341507, "step": 15874 }, { "epoch": 0.31752, "grad_norm": 1.84375, "grad_norm_var": 0.004788970947265625, "learning_rate": 0.0001, "loss": 3.8742, "loss/crossentropy": 1.8011438846588135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19294942915439606, "step": 15876 }, { "epoch": 0.31756, "grad_norm": 1.8515625, "grad_norm_var": 0.0048258463541666664, "learning_rate": 0.0001, "loss": 3.9082, "loss/crossentropy": 1.9359464049339294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19418440759181976, "step": 15878 }, { "epoch": 0.3176, "grad_norm": 2.03125, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 4.4035, "loss/crossentropy": 2.220693826675415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2583991438150406, "step": 15880 }, { "epoch": 0.31764, "grad_norm": 1.9140625, "grad_norm_var": 0.006254069010416667, "learning_rate": 0.0001, "loss": 3.9762, "loss/crossentropy": 1.7859990000724792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756511777639389, "step": 15882 }, { "epoch": 0.31768, "grad_norm": 1.9140625, "grad_norm_var": 0.00587158203125, "learning_rate": 0.0001, "loss": 3.7008, "loss/crossentropy": 1.9461398720741272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18634959310293198, "step": 15884 }, { "epoch": 0.31772, "grad_norm": 2.234375, "grad_norm_var": 0.010833485921223959, "learning_rate": 0.0001, "loss": 4.3732, "loss/crossentropy": 2.1066328287124634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2553827613592148, "step": 15886 }, { "epoch": 0.31776, "grad_norm": 2.03125, "grad_norm_var": 0.011087799072265625, "learning_rate": 0.0001, "loss": 3.8751, "loss/crossentropy": 1.7809287309646606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19362767785787582, "step": 15888 }, { "epoch": 0.3178, "grad_norm": 1.921875, "grad_norm_var": 0.01077880859375, "learning_rate": 0.0001, "loss": 4.0424, "loss/crossentropy": 1.9554992318153381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18544971197843552, "step": 15890 }, { "epoch": 0.31784, "grad_norm": 1.84375, "grad_norm_var": 0.0327392578125, "learning_rate": 0.0001, "loss": 3.9083, "loss/crossentropy": 1.5843109488487244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15569136291742325, "step": 15892 }, { "epoch": 0.31788, "grad_norm": 2.109375, "grad_norm_var": 0.0307525634765625, "learning_rate": 0.0001, "loss": 4.1213, "loss/crossentropy": 2.2000880241394043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2167157679796219, "step": 15894 }, { "epoch": 0.31792, "grad_norm": 1.9765625, "grad_norm_var": 0.031870269775390626, "learning_rate": 0.0001, "loss": 4.265, "loss/crossentropy": 2.2597590684890747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21171507239341736, "step": 15896 }, { "epoch": 0.31796, "grad_norm": 1.9296875, "grad_norm_var": 0.03181330362955729, "learning_rate": 0.0001, "loss": 3.9516, "loss/crossentropy": 2.0513627529144287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21095634251832962, "step": 15898 }, { "epoch": 0.318, "grad_norm": 1.9765625, "grad_norm_var": 0.029271443684895832, "learning_rate": 0.0001, "loss": 4.0667, "loss/crossentropy": 2.1822222471237183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117551624774933, "step": 15900 }, { "epoch": 0.31804, "grad_norm": 2.125, "grad_norm_var": 0.027784983317057293, "learning_rate": 0.0001, "loss": 4.3526, "loss/crossentropy": 1.9500460624694824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20769241452217102, "step": 15902 }, { "epoch": 0.31808, "grad_norm": 2.0625, "grad_norm_var": 0.02684326171875, "learning_rate": 0.0001, "loss": 4.2862, "loss/crossentropy": 2.179791212081909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20147693157196045, "step": 15904 }, { "epoch": 0.31812, "grad_norm": 2.03125, "grad_norm_var": 0.025217437744140626, "learning_rate": 0.0001, "loss": 4.0485, "loss/crossentropy": 1.939364731311798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18659047782421112, "step": 15906 }, { "epoch": 0.31816, "grad_norm": 1.9140625, "grad_norm_var": 0.0059478759765625, "learning_rate": 0.0001, "loss": 3.9782, "loss/crossentropy": 2.219409167766571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19967788457870483, "step": 15908 }, { "epoch": 0.3182, "grad_norm": 1.9296875, "grad_norm_var": 0.007741038004557292, "learning_rate": 0.0001, "loss": 3.829, "loss/crossentropy": 1.8363550901412964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18773943930864334, "step": 15910 }, { "epoch": 0.31824, "grad_norm": 2.0625, "grad_norm_var": 0.006151326497395833, "learning_rate": 0.0001, "loss": 4.2196, "loss/crossentropy": 1.779226541519165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20676184445619583, "step": 15912 }, { "epoch": 0.31828, "grad_norm": 1.953125, "grad_norm_var": 0.006172688802083334, "learning_rate": 0.0001, "loss": 3.9532, "loss/crossentropy": 2.1175760626792908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20498017221689224, "step": 15914 }, { "epoch": 0.31832, "grad_norm": 1.9609375, "grad_norm_var": 0.005893707275390625, "learning_rate": 0.0001, "loss": 4.1601, "loss/crossentropy": 2.0964329838752747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1898122876882553, "step": 15916 }, { "epoch": 0.31836, "grad_norm": 1.90625, "grad_norm_var": 0.005680084228515625, "learning_rate": 0.0001, "loss": 3.9093, "loss/crossentropy": 1.9687228798866272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931748390197754, "step": 15918 }, { "epoch": 0.3184, "grad_norm": 2.015625, "grad_norm_var": 0.005197906494140625, "learning_rate": 0.0001, "loss": 4.0003, "loss/crossentropy": 1.8344767689704895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19926829636096954, "step": 15920 }, { "epoch": 0.31844, "grad_norm": 1.984375, "grad_norm_var": 0.004443359375, "learning_rate": 0.0001, "loss": 4.0426, "loss/crossentropy": 2.154146194458008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21284651011228561, "step": 15922 }, { "epoch": 0.31848, "grad_norm": 2.03125, "grad_norm_var": 0.004223378499348959, "learning_rate": 0.0001, "loss": 4.2112, "loss/crossentropy": 1.877986490726471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18935512751340866, "step": 15924 }, { "epoch": 0.31852, "grad_norm": 2.171875, "grad_norm_var": 0.009297434488932292, "learning_rate": 0.0001, "loss": 4.4296, "loss/crossentropy": 2.361938714981079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23210977017879486, "step": 15926 }, { "epoch": 0.31856, "grad_norm": 2.109375, "grad_norm_var": 0.01043701171875, "learning_rate": 0.0001, "loss": 4.1209, "loss/crossentropy": 2.3886083364486694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22222661972045898, "step": 15928 }, { "epoch": 0.3186, "grad_norm": 2.25, "grad_norm_var": 0.014806874593098958, "learning_rate": 0.0001, "loss": 4.1164, "loss/crossentropy": 2.073238492012024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20130325853824615, "step": 15930 }, { "epoch": 0.31864, "grad_norm": 2.125, "grad_norm_var": 0.015569814046223958, "learning_rate": 0.0001, "loss": 4.1692, "loss/crossentropy": 1.9937713742256165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.199419766664505, "step": 15932 }, { "epoch": 0.31868, "grad_norm": 1.96875, "grad_norm_var": 0.0135894775390625, "learning_rate": 0.0001, "loss": 4.0594, "loss/crossentropy": 2.003056764602661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18682510405778885, "step": 15934 }, { "epoch": 0.31872, "grad_norm": 1.984375, "grad_norm_var": 0.0137359619140625, "learning_rate": 0.0001, "loss": 3.9555, "loss/crossentropy": 1.7363762259483337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17377281934022903, "step": 15936 }, { "epoch": 0.31876, "grad_norm": 2.0, "grad_norm_var": 0.014412180582682291, "learning_rate": 0.0001, "loss": 4.1321, "loss/crossentropy": 1.9791433811187744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24057063460350037, "step": 15938 }, { "epoch": 0.3188, "grad_norm": 2.0625, "grad_norm_var": 0.011378733317057292, "learning_rate": 0.0001, "loss": 3.9662, "loss/crossentropy": 1.9962583780288696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19619113206863403, "step": 15940 }, { "epoch": 0.31884, "grad_norm": 1.9140625, "grad_norm_var": 0.01053466796875, "learning_rate": 0.0001, "loss": 4.2093, "loss/crossentropy": 1.8568945527076721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979837343096733, "step": 15942 }, { "epoch": 0.31888, "grad_norm": 2.09375, "grad_norm_var": 0.008991495768229166, "learning_rate": 0.0001, "loss": 4.0148, "loss/crossentropy": 2.031070291996002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951117366552353, "step": 15944 }, { "epoch": 0.31892, "grad_norm": 1.9375, "grad_norm_var": 0.007916005452473958, "learning_rate": 0.0001, "loss": 4.0139, "loss/crossentropy": 2.2226544618606567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21066214889287949, "step": 15946 }, { "epoch": 0.31896, "grad_norm": 1.890625, "grad_norm_var": 0.007783762613932292, "learning_rate": 0.0001, "loss": 4.0275, "loss/crossentropy": 2.090229034423828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.225972518324852, "step": 15948 }, { "epoch": 0.319, "grad_norm": 1.90625, "grad_norm_var": 0.0080078125, "learning_rate": 0.0001, "loss": 3.9792, "loss/crossentropy": 1.8562734723091125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17633049190044403, "step": 15950 }, { "epoch": 0.31904, "grad_norm": 2.046875, "grad_norm_var": 0.00992431640625, "learning_rate": 0.0001, "loss": 4.0227, "loss/crossentropy": 1.6677230596542358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16521989554166794, "step": 15952 }, { "epoch": 0.31908, "grad_norm": 1.8828125, "grad_norm_var": 0.008939361572265625, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 2.1918782591819763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21977706253528595, "step": 15954 }, { "epoch": 0.31912, "grad_norm": 1.8984375, "grad_norm_var": 0.008701324462890625, "learning_rate": 0.0001, "loss": 3.8932, "loss/crossentropy": 1.8623137474060059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937999278306961, "step": 15956 }, { "epoch": 0.31916, "grad_norm": 2.296875, "grad_norm_var": 0.014839426676432291, "learning_rate": 0.0001, "loss": 4.2378, "loss/crossentropy": 2.262889266014099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19410346448421478, "step": 15958 }, { "epoch": 0.3192, "grad_norm": 1.9609375, "grad_norm_var": 0.014654286702473958, "learning_rate": 0.0001, "loss": 4.0487, "loss/crossentropy": 1.9620846509933472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1978064849972725, "step": 15960 }, { "epoch": 0.31924, "grad_norm": 1.9375, "grad_norm_var": 0.014207967122395833, "learning_rate": 0.0001, "loss": 4.2746, "loss/crossentropy": 2.453945517539978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21578459441661835, "step": 15962 }, { "epoch": 0.31928, "grad_norm": 1.984375, "grad_norm_var": 0.013529459635416666, "learning_rate": 0.0001, "loss": 3.9692, "loss/crossentropy": 1.9565781354904175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17789364606142044, "step": 15964 }, { "epoch": 0.31932, "grad_norm": 1.9765625, "grad_norm_var": 0.012589518229166667, "learning_rate": 0.0001, "loss": 4.3504, "loss/crossentropy": 2.1175976991653442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239348292350769, "step": 15966 }, { "epoch": 0.31936, "grad_norm": 1.984375, "grad_norm_var": 0.010554758707682292, "learning_rate": 0.0001, "loss": 4.0815, "loss/crossentropy": 1.8919953107833862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19832338392734528, "step": 15968 }, { "epoch": 0.3194, "grad_norm": 2.015625, "grad_norm_var": 0.0126220703125, "learning_rate": 0.0001, "loss": 4.1282, "loss/crossentropy": 1.9757861495018005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19513867795467377, "step": 15970 }, { "epoch": 0.31944, "grad_norm": 1.90625, "grad_norm_var": 0.012630208333333334, "learning_rate": 0.0001, "loss": 4.1045, "loss/crossentropy": 1.8716764450073242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18571900576353073, "step": 15972 }, { "epoch": 0.31948, "grad_norm": 1.984375, "grad_norm_var": 0.006575520833333333, "learning_rate": 0.0001, "loss": 4.2572, "loss/crossentropy": 2.251496374607086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199217826128006, "step": 15974 }, { "epoch": 0.31952, "grad_norm": 2.140625, "grad_norm_var": 0.008424631754557292, "learning_rate": 0.0001, "loss": 4.1033, "loss/crossentropy": 1.8639826774597168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18889226019382477, "step": 15976 }, { "epoch": 0.31956, "grad_norm": 1.890625, "grad_norm_var": 0.008499908447265624, "learning_rate": 0.0001, "loss": 3.8386, "loss/crossentropy": 1.824375331401825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1809096783399582, "step": 15978 }, { "epoch": 0.3196, "grad_norm": 2.0625, "grad_norm_var": 0.010361480712890624, "learning_rate": 0.0001, "loss": 4.4576, "loss/crossentropy": 2.357889413833618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21762560307979584, "step": 15980 }, { "epoch": 0.31964, "grad_norm": 1.9921875, "grad_norm_var": 0.011527252197265626, "learning_rate": 0.0001, "loss": 4.0706, "loss/crossentropy": 2.0944892168045044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18413915485143661, "step": 15982 }, { "epoch": 0.31968, "grad_norm": 1.8984375, "grad_norm_var": 0.011736806233723958, "learning_rate": 0.0001, "loss": 3.9458, "loss/crossentropy": 1.8600184321403503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18580162525177002, "step": 15984 }, { "epoch": 0.31972, "grad_norm": 2.03125, "grad_norm_var": 0.008697255452473959, "learning_rate": 0.0001, "loss": 4.1152, "loss/crossentropy": 1.9213852882385254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20599720627069473, "step": 15986 }, { "epoch": 0.31976, "grad_norm": 1.9453125, "grad_norm_var": 0.0119537353515625, "learning_rate": 0.0001, "loss": 4.2187, "loss/crossentropy": 2.0295584201812744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125483900308609, "step": 15988 }, { "epoch": 0.3198, "grad_norm": 2.09375, "grad_norm_var": 0.012198638916015626, "learning_rate": 0.0001, "loss": 4.2448, "loss/crossentropy": 2.148836612701416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22027160972356796, "step": 15990 }, { "epoch": 0.31984, "grad_norm": 2.03125, "grad_norm_var": 0.010201009114583333, "learning_rate": 0.0001, "loss": 4.122, "loss/crossentropy": 1.9927499294281006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20413245260715485, "step": 15992 }, { "epoch": 0.31988, "grad_norm": 2.21875, "grad_norm_var": 0.0113433837890625, "learning_rate": 0.0001, "loss": 4.2675, "loss/crossentropy": 2.090071678161621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21670149266719818, "step": 15994 }, { "epoch": 0.31992, "grad_norm": 1.953125, "grad_norm_var": 0.010074869791666666, "learning_rate": 0.0001, "loss": 4.1333, "loss/crossentropy": 1.8375160098075867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938413679599762, "step": 15996 }, { "epoch": 0.31996, "grad_norm": 1.9140625, "grad_norm_var": 0.010498046875, "learning_rate": 0.0001, "loss": 3.8915, "loss/crossentropy": 1.5196507573127747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16407200694084167, "step": 15998 }, { "epoch": 0.32, "grad_norm": 1.859375, "grad_norm_var": 0.011344146728515626, "learning_rate": 0.0001, "loss": 4.0965, "loss/crossentropy": 2.0738461017608643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21532364934682846, "step": 16000 }, { "epoch": 0.32004, "grad_norm": 1.984375, "grad_norm_var": 0.011948394775390624, "learning_rate": 0.0001, "loss": 4.0687, "loss/crossentropy": 2.3243744373321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23544494807720184, "step": 16002 }, { "epoch": 0.32008, "grad_norm": 2.046875, "grad_norm_var": 0.009862263997395834, "learning_rate": 0.0001, "loss": 3.9637, "loss/crossentropy": 1.7145931124687195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1816524863243103, "step": 16004 }, { "epoch": 0.32012, "grad_norm": 2.0625, "grad_norm_var": 0.010506184895833333, "learning_rate": 0.0001, "loss": 4.2317, "loss/crossentropy": 2.369017481803894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21978877484798431, "step": 16006 }, { "epoch": 0.32016, "grad_norm": 1.984375, "grad_norm_var": 0.010276031494140626, "learning_rate": 0.0001, "loss": 4.2283, "loss/crossentropy": 2.2246848344802856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21983492374420166, "step": 16008 }, { "epoch": 0.3202, "grad_norm": 2.671875, "grad_norm_var": 0.03774185180664062, "learning_rate": 0.0001, "loss": 4.4077, "loss/crossentropy": 2.0235647559165955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22780708968639374, "step": 16010 }, { "epoch": 0.32024, "grad_norm": 2.015625, "grad_norm_var": 0.03724543253580729, "learning_rate": 0.0001, "loss": 4.1692, "loss/crossentropy": 2.3185853958129883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22308620065450668, "step": 16012 }, { "epoch": 0.32028, "grad_norm": 2.109375, "grad_norm_var": 0.0337890625, "learning_rate": 0.0001, "loss": 4.1314, "loss/crossentropy": 1.8504603505134583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909484937787056, "step": 16014 }, { "epoch": 0.32032, "grad_norm": 1.859375, "grad_norm_var": 0.03502604166666667, "learning_rate": 0.0001, "loss": 3.8247, "loss/crossentropy": 1.7095224857330322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18588167428970337, "step": 16016 }, { "epoch": 0.32036, "grad_norm": 1.8515625, "grad_norm_var": 0.03749974568684896, "learning_rate": 0.0001, "loss": 4.1015, "loss/crossentropy": 1.8398523330688477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18085161596536636, "step": 16018 }, { "epoch": 0.3204, "grad_norm": 1.921875, "grad_norm_var": 0.03569717407226562, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 2.297620415687561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20937249064445496, "step": 16020 }, { "epoch": 0.32044, "grad_norm": 1.984375, "grad_norm_var": 0.03618545532226562, "learning_rate": 0.0001, "loss": 4.3518, "loss/crossentropy": 1.9124106764793396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072141021490097, "step": 16022 }, { "epoch": 0.32048, "grad_norm": 1.921875, "grad_norm_var": 0.03752415974934896, "learning_rate": 0.0001, "loss": 4.2695, "loss/crossentropy": 2.119450092315674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20875975489616394, "step": 16024 }, { "epoch": 0.32052, "grad_norm": 2.125, "grad_norm_var": 0.008125559488932291, "learning_rate": 0.0001, "loss": 4.35, "loss/crossentropy": 1.722611904144287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1751066967844963, "step": 16026 }, { "epoch": 0.32056, "grad_norm": 1.8984375, "grad_norm_var": 0.008760579427083333, "learning_rate": 0.0001, "loss": 4.0689, "loss/crossentropy": 2.246767997741699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999046877026558, "step": 16028 }, { "epoch": 0.3206, "grad_norm": 1.9765625, "grad_norm_var": 0.0083404541015625, "learning_rate": 0.0001, "loss": 3.9949, "loss/crossentropy": 2.0449349880218506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969577819108963, "step": 16030 }, { "epoch": 0.32064, "grad_norm": 2.140625, "grad_norm_var": 0.010050201416015625, "learning_rate": 0.0001, "loss": 4.1257, "loss/crossentropy": 2.125577926635742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22252507507801056, "step": 16032 }, { "epoch": 0.32068, "grad_norm": 1.984375, "grad_norm_var": 0.009079742431640624, "learning_rate": 0.0001, "loss": 3.639, "loss/crossentropy": 1.8161216974258423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17801867425441742, "step": 16034 }, { "epoch": 0.32072, "grad_norm": 2.0, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 3.9475, "loss/crossentropy": 2.0116894841194153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18890849500894547, "step": 16036 }, { "epoch": 0.32076, "grad_norm": 1.875, "grad_norm_var": 0.0100982666015625, "learning_rate": 0.0001, "loss": 3.8274, "loss/crossentropy": 1.505588710308075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17151623219251633, "step": 16038 }, { "epoch": 0.3208, "grad_norm": 1.9296875, "grad_norm_var": 0.008585357666015625, "learning_rate": 0.0001, "loss": 4.2162, "loss/crossentropy": 2.1405014991760254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2036641389131546, "step": 16040 }, { "epoch": 0.32084, "grad_norm": 1.953125, "grad_norm_var": 0.006628163655598958, "learning_rate": 0.0001, "loss": 4.0743, "loss/crossentropy": 2.1562809348106384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21301954984664917, "step": 16042 }, { "epoch": 0.32088, "grad_norm": 1.8984375, "grad_norm_var": 0.006628163655598958, "learning_rate": 0.0001, "loss": 3.9773, "loss/crossentropy": 1.9317167401313782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19167031347751617, "step": 16044 }, { "epoch": 0.32092, "grad_norm": 1.796875, "grad_norm_var": 0.007330067952473958, "learning_rate": 0.0001, "loss": 4.0523, "loss/crossentropy": 1.6257455348968506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1676744669675827, "step": 16046 }, { "epoch": 0.32096, "grad_norm": 2.109375, "grad_norm_var": 0.006640625, "learning_rate": 0.0001, "loss": 4.2454, "loss/crossentropy": 2.0711347460746765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268730029463768, "step": 16048 }, { "epoch": 0.321, "grad_norm": 2.03125, "grad_norm_var": 0.0071489969889322914, "learning_rate": 0.0001, "loss": 4.3333, "loss/crossentropy": 2.173452377319336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981828361749649, "step": 16050 }, { "epoch": 0.32104, "grad_norm": 1.953125, "grad_norm_var": 0.009593709309895834, "learning_rate": 0.0001, "loss": 4.4601, "loss/crossentropy": 2.348356604576111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2538425847887993, "step": 16052 }, { "epoch": 0.32108, "grad_norm": 1.984375, "grad_norm_var": 0.0089263916015625, "learning_rate": 0.0001, "loss": 4.3408, "loss/crossentropy": 2.5441235303878784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22638535499572754, "step": 16054 }, { "epoch": 0.32112, "grad_norm": 1.8515625, "grad_norm_var": 0.0098876953125, "learning_rate": 0.0001, "loss": 4.08, "loss/crossentropy": 2.129339337348938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20430771261453629, "step": 16056 }, { "epoch": 0.32116, "grad_norm": 1.9296875, "grad_norm_var": 0.01142578125, "learning_rate": 0.0001, "loss": 3.9048, "loss/crossentropy": 2.1599501371383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20718347281217575, "step": 16058 }, { "epoch": 0.3212, "grad_norm": 1.984375, "grad_norm_var": 0.010296376546223958, "learning_rate": 0.0001, "loss": 4.2746, "loss/crossentropy": 2.2704890966415405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22025802731513977, "step": 16060 }, { "epoch": 0.32124, "grad_norm": 1.796875, "grad_norm_var": 0.011702473958333333, "learning_rate": 0.0001, "loss": 3.7282, "loss/crossentropy": 1.7704021334648132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17811128497123718, "step": 16062 }, { "epoch": 0.32128, "grad_norm": 2.1875, "grad_norm_var": 0.015705362955729166, "learning_rate": 0.0001, "loss": 4.2366, "loss/crossentropy": 1.8030991554260254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20078134536743164, "step": 16064 }, { "epoch": 0.32132, "grad_norm": 2.015625, "grad_norm_var": 0.0161041259765625, "learning_rate": 0.0001, "loss": 3.9832, "loss/crossentropy": 2.14484703540802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21000836789608002, "step": 16066 }, { "epoch": 0.32136, "grad_norm": 2.203125, "grad_norm_var": 0.016521962483723958, "learning_rate": 0.0001, "loss": 4.0812, "loss/crossentropy": 2.1680833101272583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22287416458129883, "step": 16068 }, { "epoch": 0.3214, "grad_norm": 2.078125, "grad_norm_var": 0.01708958943684896, "learning_rate": 0.0001, "loss": 4.1355, "loss/crossentropy": 1.9971441626548767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20709602534770966, "step": 16070 }, { "epoch": 0.32144, "grad_norm": 2.09375, "grad_norm_var": 0.016462198893229165, "learning_rate": 0.0001, "loss": 4.2735, "loss/crossentropy": 2.180538833141327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2097538262605667, "step": 16072 }, { "epoch": 0.32148, "grad_norm": 2.046875, "grad_norm_var": 0.013529459635416666, "learning_rate": 0.0001, "loss": 4.0725, "loss/crossentropy": 2.074296534061432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20961299538612366, "step": 16074 }, { "epoch": 0.32152, "grad_norm": 1.9375, "grad_norm_var": 0.014998372395833333, "learning_rate": 0.0001, "loss": 4.3, "loss/crossentropy": 2.0936360359191895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18253444135189056, "step": 16076 }, { "epoch": 0.32156, "grad_norm": 1.8984375, "grad_norm_var": 0.0121337890625, "learning_rate": 0.0001, "loss": 3.9992, "loss/crossentropy": 2.1265164613723755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127028927206993, "step": 16078 }, { "epoch": 0.3216, "grad_norm": 2.0, "grad_norm_var": 0.008410390218098958, "learning_rate": 0.0001, "loss": 4.1907, "loss/crossentropy": 2.0804547667503357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23104286193847656, "step": 16080 }, { "epoch": 0.32164, "grad_norm": 1.9921875, "grad_norm_var": 0.007759602864583334, "learning_rate": 0.0001, "loss": 3.9653, "loss/crossentropy": 1.9458459615707397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18746595829725266, "step": 16082 }, { "epoch": 0.32168, "grad_norm": 1.96875, "grad_norm_var": 0.0056793212890625, "learning_rate": 0.0001, "loss": 4.0581, "loss/crossentropy": 2.186868667602539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20350559055805206, "step": 16084 }, { "epoch": 0.32172, "grad_norm": 1.953125, "grad_norm_var": 0.007340494791666667, "learning_rate": 0.0001, "loss": 4.2364, "loss/crossentropy": 1.5468108654022217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17710693180561066, "step": 16086 }, { "epoch": 0.32176, "grad_norm": 2.0, "grad_norm_var": 0.006180572509765625, "learning_rate": 0.0001, "loss": 4.2635, "loss/crossentropy": 2.222296118736267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22016968578100204, "step": 16088 }, { "epoch": 0.3218, "grad_norm": 1.984375, "grad_norm_var": 0.005619049072265625, "learning_rate": 0.0001, "loss": 4.0659, "loss/crossentropy": 1.7950996160507202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892552226781845, "step": 16090 }, { "epoch": 0.32184, "grad_norm": 2.15625, "grad_norm_var": 0.007120768229166667, "learning_rate": 0.0001, "loss": 4.2899, "loss/crossentropy": 2.293117642402649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2351122871041298, "step": 16092 }, { "epoch": 0.32188, "grad_norm": 1.890625, "grad_norm_var": 0.006208292643229167, "learning_rate": 0.0001, "loss": 3.8235, "loss/crossentropy": 1.6602438688278198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18052196502685547, "step": 16094 }, { "epoch": 0.32192, "grad_norm": 2.140625, "grad_norm_var": 0.008501942952473958, "learning_rate": 0.0001, "loss": 4.114, "loss/crossentropy": 1.9335945844650269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20585428178310394, "step": 16096 }, { "epoch": 0.32196, "grad_norm": 1.9140625, "grad_norm_var": 0.011378733317057292, "learning_rate": 0.0001, "loss": 4.3911, "loss/crossentropy": 2.0107831358909607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19297368824481964, "step": 16098 }, { "epoch": 0.322, "grad_norm": 2.0, "grad_norm_var": 0.011801910400390626, "learning_rate": 0.0001, "loss": 4.0153, "loss/crossentropy": 1.9804013967514038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17699319869279861, "step": 16100 }, { "epoch": 0.32204, "grad_norm": 1.9375, "grad_norm_var": 0.01048583984375, "learning_rate": 0.0001, "loss": 4.1069, "loss/crossentropy": 1.9009913206100464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931428462266922, "step": 16102 }, { "epoch": 0.32208, "grad_norm": 2.140625, "grad_norm_var": 0.011358388264973958, "learning_rate": 0.0001, "loss": 4.5134, "loss/crossentropy": 1.9558063745498657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139815017580986, "step": 16104 }, { "epoch": 0.32212, "grad_norm": 2.1875, "grad_norm_var": 0.013323720296223958, "learning_rate": 0.0001, "loss": 4.0989, "loss/crossentropy": 2.0626373887062073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022782266139984, "step": 16106 }, { "epoch": 0.32216, "grad_norm": 1.8515625, "grad_norm_var": 0.013814036051432292, "learning_rate": 0.0001, "loss": 3.9408, "loss/crossentropy": 2.166835308074951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22070696204900742, "step": 16108 }, { "epoch": 0.3222, "grad_norm": 1.9453125, "grad_norm_var": 0.012963612874348959, "learning_rate": 0.0001, "loss": 4.1382, "loss/crossentropy": 2.2970025539398193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23291311413049698, "step": 16110 }, { "epoch": 0.32224, "grad_norm": 1.921875, "grad_norm_var": 0.012174224853515625, "learning_rate": 0.0001, "loss": 3.9577, "loss/crossentropy": 1.9400970935821533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.199058435857296, "step": 16112 }, { "epoch": 0.32228, "grad_norm": 1.9453125, "grad_norm_var": 0.009110260009765624, "learning_rate": 0.0001, "loss": 3.9659, "loss/crossentropy": 1.6467864513397217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19500216841697693, "step": 16114 }, { "epoch": 0.32232, "grad_norm": 2.03125, "grad_norm_var": 0.010546875, "learning_rate": 0.0001, "loss": 4.1791, "loss/crossentropy": 2.477790355682373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22540342807769775, "step": 16116 }, { "epoch": 0.32236, "grad_norm": 1.875, "grad_norm_var": 0.011066691080729166, "learning_rate": 0.0001, "loss": 4.0661, "loss/crossentropy": 1.946933627128601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17866672575473785, "step": 16118 }, { "epoch": 0.3224, "grad_norm": 1.859375, "grad_norm_var": 0.01153564453125, "learning_rate": 0.0001, "loss": 4.1416, "loss/crossentropy": 2.4484145641326904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21314629167318344, "step": 16120 }, { "epoch": 0.32244, "grad_norm": 1.8984375, "grad_norm_var": 0.009155019124348959, "learning_rate": 0.0001, "loss": 3.9754, "loss/crossentropy": 1.9663755893707275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19814299046993256, "step": 16122 }, { "epoch": 0.32248, "grad_norm": 2.125, "grad_norm_var": 0.5887278238932292, "learning_rate": 0.0001, "loss": 3.7417, "loss/crossentropy": 1.9786240458488464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19666218757629395, "step": 16124 }, { "epoch": 0.32252, "grad_norm": 1.9453125, "grad_norm_var": 0.5901079813639323, "learning_rate": 0.0001, "loss": 3.9504, "loss/crossentropy": 1.90617835521698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1803947538137436, "step": 16126 }, { "epoch": 0.32256, "grad_norm": 2.03125, "grad_norm_var": 0.5877764383951823, "learning_rate": 0.0001, "loss": 4.3267, "loss/crossentropy": 2.1199004650115967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21409574151039124, "step": 16128 }, { "epoch": 0.3226, "grad_norm": 1.8984375, "grad_norm_var": 0.5875445048014323, "learning_rate": 0.0001, "loss": 4.184, "loss/crossentropy": 2.081290364265442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20103489607572556, "step": 16130 }, { "epoch": 0.32264, "grad_norm": 1.9375, "grad_norm_var": 0.5902565002441407, "learning_rate": 0.0001, "loss": 4.216, "loss/crossentropy": 1.9839922785758972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20679790526628494, "step": 16132 }, { "epoch": 0.32268, "grad_norm": 2.078125, "grad_norm_var": 0.5830800374348958, "learning_rate": 0.0001, "loss": 4.3441, "loss/crossentropy": 2.2765486240386963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21012597531080246, "step": 16134 }, { "epoch": 0.32272, "grad_norm": 2.125, "grad_norm_var": 0.57388916015625, "learning_rate": 0.0001, "loss": 4.0003, "loss/crossentropy": 1.8353520035743713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19367212802171707, "step": 16136 }, { "epoch": 0.32276, "grad_norm": 1.8984375, "grad_norm_var": 0.5781572977701823, "learning_rate": 0.0001, "loss": 3.8104, "loss/crossentropy": 1.8009640574455261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18030469119548798, "step": 16138 }, { "epoch": 0.3228, "grad_norm": 2.140625, "grad_norm_var": 0.020975494384765626, "learning_rate": 0.0001, "loss": 4.1914, "loss/crossentropy": 1.865959644317627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20606407523155212, "step": 16140 }, { "epoch": 0.32284, "grad_norm": 2.109375, "grad_norm_var": 0.021445465087890626, "learning_rate": 0.0001, "loss": 4.0253, "loss/crossentropy": 2.238165020942688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22822897136211395, "step": 16142 }, { "epoch": 0.32288, "grad_norm": 1.90625, "grad_norm_var": 0.02393366495768229, "learning_rate": 0.0001, "loss": 4.0425, "loss/crossentropy": 2.175424814224243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2260037586092949, "step": 16144 }, { "epoch": 0.32292, "grad_norm": 1.9453125, "grad_norm_var": 0.025050608317057292, "learning_rate": 0.0001, "loss": 4.1493, "loss/crossentropy": 2.2071722745895386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23026321083307266, "step": 16146 }, { "epoch": 0.32296, "grad_norm": 2.109375, "grad_norm_var": 0.024468739827473957, "learning_rate": 0.0001, "loss": 4.2384, "loss/crossentropy": 2.211812973022461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20882735401391983, "step": 16148 }, { "epoch": 0.323, "grad_norm": 2.046875, "grad_norm_var": 0.02444636027018229, "learning_rate": 0.0001, "loss": 4.0356, "loss/crossentropy": 1.6503818035125732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1640765517950058, "step": 16150 }, { "epoch": 0.32304, "grad_norm": 2.015625, "grad_norm_var": 0.02520726521809896, "learning_rate": 0.0001, "loss": 4.0692, "loss/crossentropy": 1.9449399709701538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1876515969634056, "step": 16152 }, { "epoch": 0.32308, "grad_norm": 1.8984375, "grad_norm_var": 0.022581990559895834, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 2.001387894153595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21191075444221497, "step": 16154 }, { "epoch": 0.32312, "grad_norm": 2.234375, "grad_norm_var": 0.0143218994140625, "learning_rate": 0.0001, "loss": 4.2115, "loss/crossentropy": 2.1146541833877563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2692951038479805, "step": 16156 }, { "epoch": 0.32316, "grad_norm": 1.9609375, "grad_norm_var": 0.015602366129557291, "learning_rate": 0.0001, "loss": 4.1534, "loss/crossentropy": 2.0765844583511353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19518104195594788, "step": 16158 }, { "epoch": 0.3232, "grad_norm": 2.0, "grad_norm_var": 0.0128814697265625, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 2.5394046306610107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22651594877243042, "step": 16160 }, { "epoch": 0.32324, "grad_norm": 1.9609375, "grad_norm_var": 0.011702473958333333, "learning_rate": 0.0001, "loss": 4.0412, "loss/crossentropy": 2.0140721797943115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18245946615934372, "step": 16162 }, { "epoch": 0.32328, "grad_norm": 1.9375, "grad_norm_var": 0.012303670247395834, "learning_rate": 0.0001, "loss": 4.3029, "loss/crossentropy": 2.472638249397278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23127944022417068, "step": 16164 }, { "epoch": 0.32332, "grad_norm": 2.03125, "grad_norm_var": 0.014924112955729167, "learning_rate": 0.0001, "loss": 4.1974, "loss/crossentropy": 2.1551883220672607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093031033873558, "step": 16166 }, { "epoch": 0.32336, "grad_norm": 2.25, "grad_norm_var": 0.01641845703125, "learning_rate": 0.0001, "loss": 4.1221, "loss/crossentropy": 2.042950928211212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049141526222229, "step": 16168 }, { "epoch": 0.3234, "grad_norm": 2.03125, "grad_norm_var": 0.014849599202473958, "learning_rate": 0.0001, "loss": 4.4795, "loss/crossentropy": 2.0262559056282043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18649922311306, "step": 16170 }, { "epoch": 0.32344, "grad_norm": 1.984375, "grad_norm_var": 0.011675771077473958, "learning_rate": 0.0001, "loss": 4.0477, "loss/crossentropy": 2.3014276027679443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21770215779542923, "step": 16172 }, { "epoch": 0.32348, "grad_norm": 2.078125, "grad_norm_var": 0.009349568684895834, "learning_rate": 0.0001, "loss": 4.2802, "loss/crossentropy": 2.213370680809021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18508044630289078, "step": 16174 }, { "epoch": 0.32352, "grad_norm": 2.03125, "grad_norm_var": 0.009075673421223958, "learning_rate": 0.0001, "loss": 4.2529, "loss/crossentropy": 1.973215639591217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937488242983818, "step": 16176 }, { "epoch": 0.32356, "grad_norm": 2.09375, "grad_norm_var": 0.018553670247395834, "learning_rate": 0.0001, "loss": 4.3534, "loss/crossentropy": 2.1585731506347656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029733881354332, "step": 16178 }, { "epoch": 0.3236, "grad_norm": 2.359375, "grad_norm_var": 0.021647135416666668, "learning_rate": 0.0001, "loss": 4.3324, "loss/crossentropy": 2.3878796100616455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19985762238502502, "step": 16180 }, { "epoch": 0.32364, "grad_norm": 1.84375, "grad_norm_var": 0.027860514322916665, "learning_rate": 0.0001, "loss": 3.8043, "loss/crossentropy": 2.07839834690094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994347795844078, "step": 16182 }, { "epoch": 0.32368, "grad_norm": 1.953125, "grad_norm_var": 0.02607421875, "learning_rate": 0.0001, "loss": 4.3147, "loss/crossentropy": 2.1800806522369385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21444538235664368, "step": 16184 }, { "epoch": 0.32372, "grad_norm": 1.8984375, "grad_norm_var": 0.029352823893229168, "learning_rate": 0.0001, "loss": 3.9927, "loss/crossentropy": 2.0900917053222656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20476030558347702, "step": 16186 }, { "epoch": 0.32376, "grad_norm": 2.0625, "grad_norm_var": 0.02899169921875, "learning_rate": 0.0001, "loss": 4.209, "loss/crossentropy": 2.1523303985595703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20984850078821182, "step": 16188 }, { "epoch": 0.3238, "grad_norm": 1.765625, "grad_norm_var": 0.03593648274739583, "learning_rate": 0.0001, "loss": 3.799, "loss/crossentropy": 1.8808646202087402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18587414175271988, "step": 16190 }, { "epoch": 0.32384, "grad_norm": 1.953125, "grad_norm_var": 0.03593648274739583, "learning_rate": 0.0001, "loss": 4.1002, "loss/crossentropy": 1.981968104839325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20611849427223206, "step": 16192 }, { "epoch": 0.32388, "grad_norm": 2.03125, "grad_norm_var": 0.020951334635416666, "learning_rate": 0.0001, "loss": 4.2811, "loss/crossentropy": 2.1454352140426636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21696189790964127, "step": 16194 }, { "epoch": 0.32392, "grad_norm": 2.078125, "grad_norm_var": 0.011905924479166666, "learning_rate": 0.0001, "loss": 4.159, "loss/crossentropy": 2.026822328567505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112257331609726, "step": 16196 }, { "epoch": 0.32396, "grad_norm": 1.9609375, "grad_norm_var": 0.009706370035807292, "learning_rate": 0.0001, "loss": 4.0838, "loss/crossentropy": 2.332329034805298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20129209011793137, "step": 16198 }, { "epoch": 0.324, "grad_norm": 2.046875, "grad_norm_var": 0.007260894775390625, "learning_rate": 0.0001, "loss": 4.11, "loss/crossentropy": 2.271657705307007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460309863090515, "step": 16200 }, { "epoch": 0.32404, "grad_norm": 2.0, "grad_norm_var": 0.0064084370930989586, "learning_rate": 0.0001, "loss": 4.0835, "loss/crossentropy": 2.0890920162200928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132192626595497, "step": 16202 }, { "epoch": 0.32408, "grad_norm": 1.9453125, "grad_norm_var": 0.007226308186848958, "learning_rate": 0.0001, "loss": 4.0271, "loss/crossentropy": 1.8368538618087769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891050636768341, "step": 16204 }, { "epoch": 0.32412, "grad_norm": 2.0625, "grad_norm_var": 0.004375966389973959, "learning_rate": 0.0001, "loss": 4.2859, "loss/crossentropy": 2.3123772144317627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22660605609416962, "step": 16206 }, { "epoch": 0.32416, "grad_norm": 2.125, "grad_norm_var": 0.005411529541015625, "learning_rate": 0.0001, "loss": 4.0989, "loss/crossentropy": 2.21794331073761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21767260879278183, "step": 16208 }, { "epoch": 0.3242, "grad_norm": 2.0625, "grad_norm_var": 0.010591379801432292, "learning_rate": 0.0001, "loss": 4.5407, "loss/crossentropy": 2.169134736061096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172672376036644, "step": 16210 }, { "epoch": 0.32424, "grad_norm": 1.921875, "grad_norm_var": 0.011331939697265625, "learning_rate": 0.0001, "loss": 4.0327, "loss/crossentropy": 2.2830699682235718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22920189797878265, "step": 16212 }, { "epoch": 0.32428, "grad_norm": 1.90625, "grad_norm_var": 0.012770334879557291, "learning_rate": 0.0001, "loss": 3.9278, "loss/crossentropy": 2.024571657180786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21096136420965195, "step": 16214 }, { "epoch": 0.32432, "grad_norm": 1.9375, "grad_norm_var": 0.013057200113932292, "learning_rate": 0.0001, "loss": 4.0244, "loss/crossentropy": 2.060720980167389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21043655276298523, "step": 16216 }, { "epoch": 0.32436, "grad_norm": 2.078125, "grad_norm_var": 0.014134724934895834, "learning_rate": 0.0001, "loss": 4.074, "loss/crossentropy": 2.0332056283950806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19506454467773438, "step": 16218 }, { "epoch": 0.3244, "grad_norm": 1.8515625, "grad_norm_var": 0.014021555582682291, "learning_rate": 0.0001, "loss": 3.8173, "loss/crossentropy": 1.811396062374115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19359800219535828, "step": 16220 }, { "epoch": 0.32444, "grad_norm": 1.9453125, "grad_norm_var": 0.013117472330729166, "learning_rate": 0.0001, "loss": 4.1846, "loss/crossentropy": 2.074379801750183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21763048321008682, "step": 16222 }, { "epoch": 0.32448, "grad_norm": 2.203125, "grad_norm_var": 0.014682769775390625, "learning_rate": 0.0001, "loss": 4.1984, "loss/crossentropy": 2.1836538314819336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20814575254917145, "step": 16224 }, { "epoch": 0.32452, "grad_norm": 1.953125, "grad_norm_var": 0.010794830322265626, "learning_rate": 0.0001, "loss": 4.2866, "loss/crossentropy": 2.365849494934082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24219170212745667, "step": 16226 }, { "epoch": 0.32456, "grad_norm": 2.0625, "grad_norm_var": 0.010343424479166667, "learning_rate": 0.0001, "loss": 4.1976, "loss/crossentropy": 1.8738153576850891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22152718901634216, "step": 16228 }, { "epoch": 0.3246, "grad_norm": 1.9921875, "grad_norm_var": 0.010209147135416667, "learning_rate": 0.0001, "loss": 4.1458, "loss/crossentropy": 2.2568124532699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21668314933776855, "step": 16230 }, { "epoch": 0.32464, "grad_norm": 2.109375, "grad_norm_var": 0.010900624593098958, "learning_rate": 0.0001, "loss": 4.2889, "loss/crossentropy": 2.4037723541259766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2225075364112854, "step": 16232 }, { "epoch": 0.32468, "grad_norm": 1.765625, "grad_norm_var": 0.01375732421875, "learning_rate": 0.0001, "loss": 3.7288, "loss/crossentropy": 2.2305864095687866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19381006807088852, "step": 16234 }, { "epoch": 0.32472, "grad_norm": 2.03125, "grad_norm_var": 0.013862864176432291, "learning_rate": 0.0001, "loss": 4.186, "loss/crossentropy": 2.1115931272506714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20033665746450424, "step": 16236 }, { "epoch": 0.32476, "grad_norm": 1.9765625, "grad_norm_var": 0.014054361979166667, "learning_rate": 0.0001, "loss": 4.1602, "loss/crossentropy": 2.0450429916381836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2080780565738678, "step": 16238 }, { "epoch": 0.3248, "grad_norm": 1.9609375, "grad_norm_var": 0.011888631184895833, "learning_rate": 0.0001, "loss": 4.056, "loss/crossentropy": 2.003769636154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20560404658317566, "step": 16240 }, { "epoch": 0.32484, "grad_norm": 2.125, "grad_norm_var": 0.010383860270182291, "learning_rate": 0.0001, "loss": 4.3504, "loss/crossentropy": 1.9452934265136719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063707485795021, "step": 16242 }, { "epoch": 0.32488, "grad_norm": 1.7734375, "grad_norm_var": 0.012497711181640624, "learning_rate": 0.0001, "loss": 3.9172, "loss/crossentropy": 2.1412742137908936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20292682200670242, "step": 16244 }, { "epoch": 0.32492, "grad_norm": 1.96875, "grad_norm_var": 0.011140950520833333, "learning_rate": 0.0001, "loss": 4.3025, "loss/crossentropy": 2.2285404205322266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22571790218353271, "step": 16246 }, { "epoch": 0.32496, "grad_norm": 1.8046875, "grad_norm_var": 0.010713704427083333, "learning_rate": 0.0001, "loss": 3.6582, "loss/crossentropy": 1.82416570186615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1771545708179474, "step": 16248 }, { "epoch": 0.325, "grad_norm": 2.109375, "grad_norm_var": 0.010422515869140624, "learning_rate": 0.0001, "loss": 4.2533, "loss/crossentropy": 2.1432559490203857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20037438720464706, "step": 16250 }, { "epoch": 0.32504, "grad_norm": 2.234375, "grad_norm_var": 0.015337880452473958, "learning_rate": 0.0001, "loss": 4.162, "loss/crossentropy": 2.1962517499923706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22743911296129227, "step": 16252 }, { "epoch": 0.32508, "grad_norm": 2.125, "grad_norm_var": 0.01636530558268229, "learning_rate": 0.0001, "loss": 4.2223, "loss/crossentropy": 1.932455599308014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043299823999405, "step": 16254 }, { "epoch": 0.32512, "grad_norm": 1.9453125, "grad_norm_var": 0.015824127197265624, "learning_rate": 0.0001, "loss": 3.9024, "loss/crossentropy": 1.9702956676483154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19528628885746002, "step": 16256 }, { "epoch": 0.32516, "grad_norm": 2.296875, "grad_norm_var": 0.020475260416666665, "learning_rate": 0.0001, "loss": 4.0628, "loss/crossentropy": 2.0042436718940735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19861597567796707, "step": 16258 }, { "epoch": 0.3252, "grad_norm": 2.28125, "grad_norm_var": 0.019573720296223958, "learning_rate": 0.0001, "loss": 4.1876, "loss/crossentropy": 2.3544296622276306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22385136783123016, "step": 16260 }, { "epoch": 0.32524, "grad_norm": 1.984375, "grad_norm_var": 0.01942723592122396, "learning_rate": 0.0001, "loss": 4.1435, "loss/crossentropy": 2.1482596397399902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21662700176239014, "step": 16262 }, { "epoch": 0.32528, "grad_norm": 1.96875, "grad_norm_var": 0.01395263671875, "learning_rate": 0.0001, "loss": 4.3441, "loss/crossentropy": 2.243022322654724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22729884833097458, "step": 16264 }, { "epoch": 0.32532, "grad_norm": 1.875, "grad_norm_var": 0.018656158447265626, "learning_rate": 0.0001, "loss": 3.9013, "loss/crossentropy": 2.2499197721481323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20558901876211166, "step": 16266 }, { "epoch": 0.32536, "grad_norm": 1.9140625, "grad_norm_var": 0.022907511393229166, "learning_rate": 0.0001, "loss": 4.2313, "loss/crossentropy": 1.6232356429100037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18864237517118454, "step": 16268 }, { "epoch": 0.3254, "grad_norm": 1.96875, "grad_norm_var": 0.02402521769205729, "learning_rate": 0.0001, "loss": 4.0435, "loss/crossentropy": 1.559517502784729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1704516038298607, "step": 16270 }, { "epoch": 0.32544, "grad_norm": 1.890625, "grad_norm_var": 0.02484308878580729, "learning_rate": 0.0001, "loss": 3.9018, "loss/crossentropy": 1.7346046566963196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17380736768245697, "step": 16272 }, { "epoch": 0.32548, "grad_norm": 2.21875, "grad_norm_var": 0.02237523396809896, "learning_rate": 0.0001, "loss": 4.2894, "loss/crossentropy": 2.1451315879821777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21778792887926102, "step": 16274 }, { "epoch": 0.32552, "grad_norm": 2.25, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 3.8737, "loss/crossentropy": 1.8013367056846619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18631768971681595, "step": 16276 }, { "epoch": 0.32556, "grad_norm": 1.9609375, "grad_norm_var": 0.025655110677083332, "learning_rate": 0.0001, "loss": 3.8573, "loss/crossentropy": 1.8858280181884766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1940329447388649, "step": 16278 }, { "epoch": 0.3256, "grad_norm": 1.9375, "grad_norm_var": 0.0259429931640625, "learning_rate": 0.0001, "loss": 4.1268, "loss/crossentropy": 1.8475046157836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18596403300762177, "step": 16280 }, { "epoch": 0.32564, "grad_norm": 2.0, "grad_norm_var": 0.023636881510416666, "learning_rate": 0.0001, "loss": 4.2602, "loss/crossentropy": 2.2585566639900208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22430463135242462, "step": 16282 }, { "epoch": 0.32568, "grad_norm": 1.8984375, "grad_norm_var": 0.015900675455729166, "learning_rate": 0.0001, "loss": 3.7224, "loss/crossentropy": 1.7233783602714539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17318468540906906, "step": 16284 }, { "epoch": 0.32572, "grad_norm": 1.9921875, "grad_norm_var": 0.022172037760416666, "learning_rate": 0.0001, "loss": 4.3042, "loss/crossentropy": 2.174088716506958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21329404413700104, "step": 16286 }, { "epoch": 0.32576, "grad_norm": 2.015625, "grad_norm_var": 0.022345987955729167, "learning_rate": 0.0001, "loss": 3.8932, "loss/crossentropy": 1.8016705513000488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20395102351903915, "step": 16288 }, { "epoch": 0.3258, "grad_norm": 2.03125, "grad_norm_var": 0.020182037353515626, "learning_rate": 0.0001, "loss": 3.8634, "loss/crossentropy": 2.166901111602783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21237251162528992, "step": 16290 }, { "epoch": 0.32584, "grad_norm": 1.9921875, "grad_norm_var": 0.014766184488932292, "learning_rate": 0.0001, "loss": 4.07, "loss/crossentropy": 2.024773359298706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973796784877777, "step": 16292 }, { "epoch": 0.32588, "grad_norm": 2.203125, "grad_norm_var": 0.015466054280598959, "learning_rate": 0.0001, "loss": 4.173, "loss/crossentropy": 2.457033157348633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21896487474441528, "step": 16294 }, { "epoch": 0.32592, "grad_norm": 2.03125, "grad_norm_var": 0.014180501302083334, "learning_rate": 0.0001, "loss": 4.11, "loss/crossentropy": 2.4872595071792603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2348102256655693, "step": 16296 }, { "epoch": 0.32596, "grad_norm": 2.03125, "grad_norm_var": 0.014249420166015625, "learning_rate": 0.0001, "loss": 4.0756, "loss/crossentropy": 2.110987067222595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21079359203577042, "step": 16298 }, { "epoch": 0.326, "grad_norm": 1.9375, "grad_norm_var": 0.0133941650390625, "learning_rate": 0.0001, "loss": 4.2334, "loss/crossentropy": 2.2166510820388794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19656993448734283, "step": 16300 }, { "epoch": 0.32604, "grad_norm": 2.046875, "grad_norm_var": 0.008125813802083333, "learning_rate": 0.0001, "loss": 4.0212, "loss/crossentropy": 2.014153838157654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956234946846962, "step": 16302 }, { "epoch": 0.32608, "grad_norm": 2.671875, "grad_norm_var": 0.03588231404622396, "learning_rate": 0.0001, "loss": 4.3045, "loss/crossentropy": 2.298485517501831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21710477769374847, "step": 16304 }, { "epoch": 0.32612, "grad_norm": 1.9921875, "grad_norm_var": 0.033599599202473955, "learning_rate": 0.0001, "loss": 4.3134, "loss/crossentropy": 2.233761191368103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21609390527009964, "step": 16306 }, { "epoch": 0.32616, "grad_norm": 1.8515625, "grad_norm_var": 0.03675918579101563, "learning_rate": 0.0001, "loss": 3.9141, "loss/crossentropy": 1.9988956451416016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1924634352326393, "step": 16308 }, { "epoch": 0.3262, "grad_norm": 1.96875, "grad_norm_var": 0.035194651285807295, "learning_rate": 0.0001, "loss": 3.9726, "loss/crossentropy": 1.9185590147972107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20295168459415436, "step": 16310 }, { "epoch": 0.32624, "grad_norm": 2.0625, "grad_norm_var": 0.03476155598958333, "learning_rate": 0.0001, "loss": 4.0805, "loss/crossentropy": 2.219870448112488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21425887942314148, "step": 16312 }, { "epoch": 0.32628, "grad_norm": 2.21875, "grad_norm_var": 0.03612442016601562, "learning_rate": 0.0001, "loss": 4.5119, "loss/crossentropy": 2.219307541847229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22380203753709793, "step": 16314 }, { "epoch": 0.32632, "grad_norm": 1.875, "grad_norm_var": 0.03746515909830729, "learning_rate": 0.0001, "loss": 3.9652, "loss/crossentropy": 1.7907224893569946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18037232011556625, "step": 16316 }, { "epoch": 0.32636, "grad_norm": 1.8203125, "grad_norm_var": 0.03876927693684896, "learning_rate": 0.0001, "loss": 4.1337, "loss/crossentropy": 1.9702708721160889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18266448378562927, "step": 16318 }, { "epoch": 0.3264, "grad_norm": 1.90625, "grad_norm_var": 0.012481435139973959, "learning_rate": 0.0001, "loss": 4.0551, "loss/crossentropy": 2.1748844385147095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20725325495004654, "step": 16320 }, { "epoch": 0.32644, "grad_norm": 1.953125, "grad_norm_var": 0.0119781494140625, "learning_rate": 0.0001, "loss": 4.2882, "loss/crossentropy": 2.2458006143569946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21462788432836533, "step": 16322 }, { "epoch": 0.32648, "grad_norm": 1.9296875, "grad_norm_var": 0.009276326497395833, "learning_rate": 0.0001, "loss": 4.1231, "loss/crossentropy": 1.923665463924408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223393976688385, "step": 16324 }, { "epoch": 0.32652, "grad_norm": 1.9296875, "grad_norm_var": 0.010129547119140625, "learning_rate": 0.0001, "loss": 4.1643, "loss/crossentropy": 2.36092209815979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22316965460777283, "step": 16326 }, { "epoch": 0.32656, "grad_norm": 2.046875, "grad_norm_var": 0.010456339518229166, "learning_rate": 0.0001, "loss": 4.0778, "loss/crossentropy": 1.7765440344810486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1736016422510147, "step": 16328 }, { "epoch": 0.3266, "grad_norm": 1.9609375, "grad_norm_var": 0.006638336181640625, "learning_rate": 0.0001, "loss": 4.4885, "loss/crossentropy": 2.2588138580322266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21762139350175858, "step": 16330 }, { "epoch": 0.32664, "grad_norm": 1.921875, "grad_norm_var": 0.0064084370930989586, "learning_rate": 0.0001, "loss": 4.2476, "loss/crossentropy": 2.177221417427063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19594799727201462, "step": 16332 }, { "epoch": 0.32668, "grad_norm": 2.0625, "grad_norm_var": 0.0051422119140625, "learning_rate": 0.0001, "loss": 3.941, "loss/crossentropy": 1.8424673676490784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19562739878892899, "step": 16334 }, { "epoch": 0.32672, "grad_norm": 1.8515625, "grad_norm_var": 0.006473541259765625, "learning_rate": 0.0001, "loss": 3.861, "loss/crossentropy": 1.8677524328231812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19708558171987534, "step": 16336 }, { "epoch": 0.32676, "grad_norm": 1.859375, "grad_norm_var": 0.007081858317057292, "learning_rate": 0.0001, "loss": 3.9222, "loss/crossentropy": 1.9559081196784973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19585052132606506, "step": 16338 }, { "epoch": 0.3268, "grad_norm": 1.9765625, "grad_norm_var": 0.006892649332682291, "learning_rate": 0.0001, "loss": 4.144, "loss/crossentropy": 1.979565978050232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045053392648697, "step": 16340 }, { "epoch": 0.32684, "grad_norm": 1.953125, "grad_norm_var": 0.00714111328125, "learning_rate": 0.0001, "loss": 4.2517, "loss/crossentropy": 2.296669840812683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.207671657204628, "step": 16342 }, { "epoch": 0.32688, "grad_norm": 2.0, "grad_norm_var": 0.0065305074055989586, "learning_rate": 0.0001, "loss": 4.0279, "loss/crossentropy": 1.9962339401245117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19833384454250336, "step": 16344 }, { "epoch": 0.32692, "grad_norm": 2.015625, "grad_norm_var": 0.006078084309895833, "learning_rate": 0.0001, "loss": 4.1779, "loss/crossentropy": 2.020018517971039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19721734523773193, "step": 16346 }, { "epoch": 0.32696, "grad_norm": 1.8828125, "grad_norm_var": 0.006815592447916667, "learning_rate": 0.0001, "loss": 3.7614, "loss/crossentropy": 2.112026810646057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20460353046655655, "step": 16348 }, { "epoch": 0.327, "grad_norm": 1.9453125, "grad_norm_var": 0.006540679931640625, "learning_rate": 0.0001, "loss": 4.1509, "loss/crossentropy": 2.2069387435913086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233232483267784, "step": 16350 }, { "epoch": 0.32704, "grad_norm": 1.9375, "grad_norm_var": 0.004792277018229167, "learning_rate": 0.0001, "loss": 3.8675, "loss/crossentropy": 1.9365113377571106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19625376164913177, "step": 16352 }, { "epoch": 0.32708, "grad_norm": 2.125, "grad_norm_var": 0.0052154541015625, "learning_rate": 0.0001, "loss": 4.1831, "loss/crossentropy": 2.2316179871559143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22712747752666473, "step": 16354 }, { "epoch": 0.32712, "grad_norm": 2.0, "grad_norm_var": 0.035672760009765624, "learning_rate": 0.0001, "loss": 3.9148, "loss/crossentropy": 1.801742434501648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1831187978386879, "step": 16356 }, { "epoch": 0.32716, "grad_norm": 1.9921875, "grad_norm_var": 0.03590087890625, "learning_rate": 0.0001, "loss": 3.9819, "loss/crossentropy": 2.157936453819275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21520215272903442, "step": 16358 }, { "epoch": 0.3272, "grad_norm": 2.03125, "grad_norm_var": 0.035835520426432295, "learning_rate": 0.0001, "loss": 4.0743, "loss/crossentropy": 1.6054654717445374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1769917830824852, "step": 16360 }, { "epoch": 0.32724, "grad_norm": 2.15625, "grad_norm_var": 0.039613596598307294, "learning_rate": 0.0001, "loss": 4.0751, "loss/crossentropy": 2.0573782324790955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22550390660762787, "step": 16362 }, { "epoch": 0.32728, "grad_norm": 2.03125, "grad_norm_var": 0.035131581624348956, "learning_rate": 0.0001, "loss": 4.281, "loss/crossentropy": 2.0643117427825928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21060281991958618, "step": 16364 }, { "epoch": 0.32732, "grad_norm": 1.8984375, "grad_norm_var": 0.042525227864583334, "learning_rate": 0.0001, "loss": 4.0884, "loss/crossentropy": 2.1980225443840027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19246292114257812, "step": 16366 }, { "epoch": 0.32736, "grad_norm": 2.078125, "grad_norm_var": 0.042577107747395836, "learning_rate": 0.0001, "loss": 4.05, "loss/crossentropy": 1.8917307257652283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17911095917224884, "step": 16368 }, { "epoch": 0.3274, "grad_norm": 2.15625, "grad_norm_var": 0.0429595947265625, "learning_rate": 0.0001, "loss": 4.1415, "loss/crossentropy": 2.0520957708358765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19265015423297882, "step": 16370 }, { "epoch": 0.32744, "grad_norm": 2.28125, "grad_norm_var": 0.019510904947916668, "learning_rate": 0.0001, "loss": 4.3026, "loss/crossentropy": 2.4500341415405273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21713324636220932, "step": 16372 }, { "epoch": 0.32748, "grad_norm": 1.8828125, "grad_norm_var": 0.02032648722330729, "learning_rate": 0.0001, "loss": 4.0754, "loss/crossentropy": 1.8100037574768066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16967355459928513, "step": 16374 }, { "epoch": 0.32752, "grad_norm": 1.9375, "grad_norm_var": 0.021174875895182292, "learning_rate": 0.0001, "loss": 4.0244, "loss/crossentropy": 2.092597723007202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1998964473605156, "step": 16376 }, { "epoch": 0.32756, "grad_norm": 2.03125, "grad_norm_var": 0.016076405843098957, "learning_rate": 0.0001, "loss": 4.3627, "loss/crossentropy": 2.151292622089386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19120946526527405, "step": 16378 }, { "epoch": 0.3276, "grad_norm": 1.796875, "grad_norm_var": 0.020906575520833335, "learning_rate": 0.0001, "loss": 3.6949, "loss/crossentropy": 1.7384315729141235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18261852860450745, "step": 16380 }, { "epoch": 0.32764, "grad_norm": 1.953125, "grad_norm_var": 0.019147745768229165, "learning_rate": 0.0001, "loss": 3.9341, "loss/crossentropy": 1.662436068058014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16703163087368011, "step": 16382 }, { "epoch": 0.32768, "grad_norm": 1.984375, "grad_norm_var": 0.018871053059895834, "learning_rate": 0.0001, "loss": 4.3202, "loss/crossentropy": 2.2057151794433594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20989537239074707, "step": 16384 }, { "epoch": 0.32772, "grad_norm": 2.03125, "grad_norm_var": 0.068359375, "learning_rate": 0.0001, "loss": 4.3033, "loss/crossentropy": 2.1689382791519165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20804189145565033, "step": 16386 }, { "epoch": 0.32776, "grad_norm": 2.171875, "grad_norm_var": 0.06450169881184896, "learning_rate": 0.0001, "loss": 4.2241, "loss/crossentropy": 2.3121412992477417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20552029460668564, "step": 16388 }, { "epoch": 0.3278, "grad_norm": 2.046875, "grad_norm_var": 0.06318333943684896, "learning_rate": 0.0001, "loss": 4.1396, "loss/crossentropy": 2.0784353017807007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20427466928958893, "step": 16390 }, { "epoch": 0.32784, "grad_norm": 1.9609375, "grad_norm_var": 0.06282145182291667, "learning_rate": 0.0001, "loss": 4.1132, "loss/crossentropy": 2.147130608558655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2181989550590515, "step": 16392 }, { "epoch": 0.32788, "grad_norm": 1.9296875, "grad_norm_var": 0.06468505859375, "learning_rate": 0.0001, "loss": 3.812, "loss/crossentropy": 1.7986091375350952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18294771760702133, "step": 16394 }, { "epoch": 0.32792, "grad_norm": 1.9140625, "grad_norm_var": 0.059004720052083334, "learning_rate": 0.0001, "loss": 4.037, "loss/crossentropy": 2.0563793778419495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19180169701576233, "step": 16396 }, { "epoch": 0.32796, "grad_norm": 2.015625, "grad_norm_var": 0.05506591796875, "learning_rate": 0.0001, "loss": 4.3111, "loss/crossentropy": 2.462133765220642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21563740074634552, "step": 16398 }, { "epoch": 0.328, "grad_norm": 1.9375, "grad_norm_var": 0.059427897135416664, "learning_rate": 0.0001, "loss": 3.9743, "loss/crossentropy": 1.8900938630104065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845797374844551, "step": 16400 }, { "epoch": 0.32804, "grad_norm": 2.375, "grad_norm_var": 0.0195068359375, "learning_rate": 0.0001, "loss": 4.1653, "loss/crossentropy": 2.0627527832984924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20268447697162628, "step": 16402 }, { "epoch": 0.32808, "grad_norm": 2.125, "grad_norm_var": 0.030147043863932292, "learning_rate": 0.0001, "loss": 4.3042, "loss/crossentropy": 2.058929443359375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19984640181064606, "step": 16404 }, { "epoch": 0.32812, "grad_norm": 1.9296875, "grad_norm_var": 0.030564117431640624, "learning_rate": 0.0001, "loss": 4.1162, "loss/crossentropy": 1.816510558128357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1754368245601654, "step": 16406 }, { "epoch": 0.32816, "grad_norm": 2.015625, "grad_norm_var": 0.0302886962890625, "learning_rate": 0.0001, "loss": 4.263, "loss/crossentropy": 1.9079806208610535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18917685747146606, "step": 16408 }, { "epoch": 0.3282, "grad_norm": 1.9765625, "grad_norm_var": 0.028612263997395835, "learning_rate": 0.0001, "loss": 3.9264, "loss/crossentropy": 2.0913302898406982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088220864534378, "step": 16410 }, { "epoch": 0.32824, "grad_norm": 2.09375, "grad_norm_var": 0.028547159830729165, "learning_rate": 0.0001, "loss": 3.9029, "loss/crossentropy": 2.1609301567077637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010297030210495, "step": 16412 }, { "epoch": 0.32828, "grad_norm": 3.078125, "grad_norm_var": 0.0982666015625, "learning_rate": 0.0001, "loss": 4.2563, "loss/crossentropy": 1.8449677228927612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23620254546403885, "step": 16414 }, { "epoch": 0.32832, "grad_norm": 2.078125, "grad_norm_var": 0.09134699503580729, "learning_rate": 0.0001, "loss": 4.1053, "loss/crossentropy": 2.059410274028778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20259736478328705, "step": 16416 }, { "epoch": 0.32836, "grad_norm": 1.8828125, "grad_norm_var": 0.08761571248372396, "learning_rate": 0.0001, "loss": 3.8718, "loss/crossentropy": 1.6095005869865417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17026344686746597, "step": 16418 }, { "epoch": 0.3284, "grad_norm": 2.03125, "grad_norm_var": 0.07932510375976562, "learning_rate": 0.0001, "loss": 4.2623, "loss/crossentropy": 2.200170636177063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2259170413017273, "step": 16420 }, { "epoch": 0.32844, "grad_norm": 1.921875, "grad_norm_var": 0.08116226196289063, "learning_rate": 0.0001, "loss": 4.0724, "loss/crossentropy": 1.970030963420868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19053932279348373, "step": 16422 }, { "epoch": 0.32848, "grad_norm": 1.890625, "grad_norm_var": 0.08263753255208334, "learning_rate": 0.0001, "loss": 4.106, "loss/crossentropy": 2.2675901651382446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22277145832777023, "step": 16424 }, { "epoch": 0.32852, "grad_norm": 1.78125, "grad_norm_var": 0.08910725911458334, "learning_rate": 0.0001, "loss": 3.723, "loss/crossentropy": 1.6446372866630554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15234287828207016, "step": 16426 }, { "epoch": 0.32856, "grad_norm": 1.984375, "grad_norm_var": 0.0875689188639323, "learning_rate": 0.0001, "loss": 4.3421, "loss/crossentropy": 2.350424647331238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190476581454277, "step": 16428 }, { "epoch": 0.3286, "grad_norm": 2.046875, "grad_norm_var": 0.009279123942057292, "learning_rate": 0.0001, "loss": 3.9663, "loss/crossentropy": 1.8809763193130493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19404269009828568, "step": 16430 }, { "epoch": 0.32864, "grad_norm": 1.9296875, "grad_norm_var": 0.008257802327473958, "learning_rate": 0.0001, "loss": 4.1731, "loss/crossentropy": 2.1159849166870117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21232837438583374, "step": 16432 }, { "epoch": 0.32868, "grad_norm": 2.421875, "grad_norm_var": 0.02133763631184896, "learning_rate": 0.0001, "loss": 4.2875, "loss/crossentropy": 2.3252066373825073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24867044389247894, "step": 16434 }, { "epoch": 0.32872, "grad_norm": 1.9140625, "grad_norm_var": 0.020186360677083334, "learning_rate": 0.0001, "loss": 4.2462, "loss/crossentropy": 2.0642696619033813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18409788608551025, "step": 16436 }, { "epoch": 0.32876, "grad_norm": 2.203125, "grad_norm_var": 0.022725423177083332, "learning_rate": 0.0001, "loss": 4.3832, "loss/crossentropy": 2.359580874443054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.218842551112175, "step": 16438 }, { "epoch": 0.3288, "grad_norm": 1.8984375, "grad_norm_var": 0.02264378865559896, "learning_rate": 0.0001, "loss": 3.7996, "loss/crossentropy": 2.0959436893463135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993689462542534, "step": 16440 }, { "epoch": 0.32884, "grad_norm": 1.96875, "grad_norm_var": 0.017679595947265626, "learning_rate": 0.0001, "loss": 4.0838, "loss/crossentropy": 2.1507667303085327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20800111442804337, "step": 16442 }, { "epoch": 0.32888, "grad_norm": 1.875, "grad_norm_var": 0.020005035400390624, "learning_rate": 0.0001, "loss": 3.9428, "loss/crossentropy": 1.8771533370018005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20476838946342468, "step": 16444 }, { "epoch": 0.32892, "grad_norm": 2.046875, "grad_norm_var": 0.020005035400390624, "learning_rate": 0.0001, "loss": 4.2689, "loss/crossentropy": 2.365285038948059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22512932121753693, "step": 16446 }, { "epoch": 0.32896, "grad_norm": 2.0625, "grad_norm_var": 0.019701131184895835, "learning_rate": 0.0001, "loss": 4.2928, "loss/crossentropy": 2.2584418058395386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21187593042850494, "step": 16448 }, { "epoch": 0.329, "grad_norm": 1.9921875, "grad_norm_var": 0.010270182291666667, "learning_rate": 0.0001, "loss": 3.8186, "loss/crossentropy": 1.9227403402328491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945207566022873, "step": 16450 }, { "epoch": 0.32904, "grad_norm": 1.9296875, "grad_norm_var": 0.011897786458333334, "learning_rate": 0.0001, "loss": 3.8512, "loss/crossentropy": 2.062865734100342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064652517437935, "step": 16452 }, { "epoch": 0.32908, "grad_norm": 1.9140625, "grad_norm_var": 0.007505035400390625, "learning_rate": 0.0001, "loss": 4.0493, "loss/crossentropy": 1.98399817943573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22567245364189148, "step": 16454 }, { "epoch": 0.32912, "grad_norm": 2.0, "grad_norm_var": 0.007428995768229167, "learning_rate": 0.0001, "loss": 4.2888, "loss/crossentropy": 2.0655510425567627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20936425030231476, "step": 16456 }, { "epoch": 0.32916, "grad_norm": 2.140625, "grad_norm_var": 0.01739501953125, "learning_rate": 0.0001, "loss": 4.1486, "loss/crossentropy": 1.8360095024108887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18788952380418777, "step": 16458 }, { "epoch": 0.3292, "grad_norm": 1.9453125, "grad_norm_var": 0.01672948201497396, "learning_rate": 0.0001, "loss": 4.2265, "loss/crossentropy": 2.1088568568229675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20265556871891022, "step": 16460 }, { "epoch": 0.32924, "grad_norm": 1.921875, "grad_norm_var": 0.017601521809895833, "learning_rate": 0.0001, "loss": 4.1004, "loss/crossentropy": 2.3746429681777954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21173583716154099, "step": 16462 }, { "epoch": 0.32928, "grad_norm": 2.078125, "grad_norm_var": 0.01946996053059896, "learning_rate": 0.0001, "loss": 3.9577, "loss/crossentropy": 2.0129969716072083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18141476809978485, "step": 16464 }, { "epoch": 0.32932, "grad_norm": 1.9921875, "grad_norm_var": 0.019160970052083334, "learning_rate": 0.0001, "loss": 4.2122, "loss/crossentropy": 2.385707139968872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22456685453653336, "step": 16466 }, { "epoch": 0.32936, "grad_norm": 2.09375, "grad_norm_var": 0.015958658854166665, "learning_rate": 0.0001, "loss": 4.183, "loss/crossentropy": 2.466804623603821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23166514188051224, "step": 16468 }, { "epoch": 0.3294, "grad_norm": 1.8515625, "grad_norm_var": 0.018700154622395833, "learning_rate": 0.0001, "loss": 3.8552, "loss/crossentropy": 1.9182489514350891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918247565627098, "step": 16470 }, { "epoch": 0.32944, "grad_norm": 2.125, "grad_norm_var": 0.021993001302083332, "learning_rate": 0.0001, "loss": 4.4364, "loss/crossentropy": 2.27492892742157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2237030565738678, "step": 16472 }, { "epoch": 0.32948, "grad_norm": 2.03125, "grad_norm_var": 0.0144927978515625, "learning_rate": 0.0001, "loss": 4.0022, "loss/crossentropy": 1.856327474117279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17718566209077835, "step": 16474 }, { "epoch": 0.32952, "grad_norm": 1.859375, "grad_norm_var": 0.0145263671875, "learning_rate": 0.0001, "loss": 3.9994, "loss/crossentropy": 2.014274477958679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20536774396896362, "step": 16476 }, { "epoch": 0.32956, "grad_norm": 2.03125, "grad_norm_var": 0.016383616129557292, "learning_rate": 0.0001, "loss": 4.4606, "loss/crossentropy": 2.098844289779663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18847080320119858, "step": 16478 }, { "epoch": 0.3296, "grad_norm": 1.890625, "grad_norm_var": 0.014647420247395833, "learning_rate": 0.0001, "loss": 4.1978, "loss/crossentropy": 2.0402814149856567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929381936788559, "step": 16480 }, { "epoch": 0.32964, "grad_norm": 2.0625, "grad_norm_var": 0.013315582275390625, "learning_rate": 0.0001, "loss": 4.2737, "loss/crossentropy": 2.413077712059021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2426547110080719, "step": 16482 }, { "epoch": 0.32968, "grad_norm": 2.015625, "grad_norm_var": 0.013401031494140625, "learning_rate": 0.0001, "loss": 3.9544, "loss/crossentropy": 2.081916332244873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190306931734085, "step": 16484 }, { "epoch": 0.32972, "grad_norm": 1.9375, "grad_norm_var": 0.010765584309895833, "learning_rate": 0.0001, "loss": 4.1093, "loss/crossentropy": 2.1017117500305176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19809769093990326, "step": 16486 }, { "epoch": 0.32976, "grad_norm": 1.8515625, "grad_norm_var": 0.008139801025390626, "learning_rate": 0.0001, "loss": 3.9585, "loss/crossentropy": 2.289687156677246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21503399312496185, "step": 16488 }, { "epoch": 0.3298, "grad_norm": 2.09375, "grad_norm_var": 0.008642323811848958, "learning_rate": 0.0001, "loss": 4.1952, "loss/crossentropy": 2.0613157749176025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20428171753883362, "step": 16490 }, { "epoch": 0.32984, "grad_norm": 2.0, "grad_norm_var": 0.0073964436848958336, "learning_rate": 0.0001, "loss": 4.3191, "loss/crossentropy": 2.1484400033950806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20850211381912231, "step": 16492 }, { "epoch": 0.32988, "grad_norm": 1.8359375, "grad_norm_var": 0.0072629292805989586, "learning_rate": 0.0001, "loss": 3.875, "loss/crossentropy": 1.6853107213974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1736336275935173, "step": 16494 }, { "epoch": 0.32992, "grad_norm": 1.96875, "grad_norm_var": 0.007389068603515625, "learning_rate": 0.0001, "loss": 4.2047, "loss/crossentropy": 2.323481321334839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22742202132940292, "step": 16496 }, { "epoch": 0.32996, "grad_norm": 2.078125, "grad_norm_var": 0.00738525390625, "learning_rate": 0.0001, "loss": 4.124, "loss/crossentropy": 2.1196956038475037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19792834669351578, "step": 16498 }, { "epoch": 0.33, "grad_norm": 1.9921875, "grad_norm_var": 0.007417805989583333, "learning_rate": 0.0001, "loss": 4.0013, "loss/crossentropy": 1.947241187095642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21067001670598984, "step": 16500 }, { "epoch": 0.33004, "grad_norm": 2.046875, "grad_norm_var": 0.00758056640625, "learning_rate": 0.0001, "loss": 4.0631, "loss/crossentropy": 2.256502628326416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22300700098276138, "step": 16502 }, { "epoch": 0.33008, "grad_norm": 1.8828125, "grad_norm_var": 0.007574208577473958, "learning_rate": 0.0001, "loss": 3.7874, "loss/crossentropy": 1.7947281002998352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1860191375017166, "step": 16504 }, { "epoch": 0.33012, "grad_norm": 1.921875, "grad_norm_var": 0.007995351155598959, "learning_rate": 0.0001, "loss": 4.0189, "loss/crossentropy": 1.9331985712051392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20170047879219055, "step": 16506 }, { "epoch": 0.33016, "grad_norm": 1.96875, "grad_norm_var": 0.011030832926432291, "learning_rate": 0.0001, "loss": 4.2148, "loss/crossentropy": 2.2274327278137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977931559085846, "step": 16508 }, { "epoch": 0.3302, "grad_norm": 2.015625, "grad_norm_var": 0.007884724934895834, "learning_rate": 0.0001, "loss": 4.15, "loss/crossentropy": 1.8204082250595093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18195096403360367, "step": 16510 }, { "epoch": 0.33024, "grad_norm": 1.9296875, "grad_norm_var": 0.009633127848307292, "learning_rate": 0.0001, "loss": 4.2363, "loss/crossentropy": 2.308950901031494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22067535668611526, "step": 16512 }, { "epoch": 0.33028, "grad_norm": 2.3125, "grad_norm_var": 0.014794921875, "learning_rate": 0.0001, "loss": 4.3895, "loss/crossentropy": 2.1682082414627075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2021712213754654, "step": 16514 }, { "epoch": 0.33032, "grad_norm": 1.90625, "grad_norm_var": 0.016047159830729168, "learning_rate": 0.0001, "loss": 3.9213, "loss/crossentropy": 1.9669402837753296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19773626327514648, "step": 16516 }, { "epoch": 0.33036, "grad_norm": 2.046875, "grad_norm_var": 0.016047159830729168, "learning_rate": 0.0001, "loss": 4.1723, "loss/crossentropy": 2.0478034019470215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19190094619989395, "step": 16518 }, { "epoch": 0.3304, "grad_norm": 1.984375, "grad_norm_var": 0.014127349853515625, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.8262990713119507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19281092286109924, "step": 16520 }, { "epoch": 0.33044, "grad_norm": 1.9453125, "grad_norm_var": 0.0139801025390625, "learning_rate": 0.0001, "loss": 4.385, "loss/crossentropy": 2.3943055868148804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2286393940448761, "step": 16522 }, { "epoch": 0.33048, "grad_norm": 1.90625, "grad_norm_var": 0.0124664306640625, "learning_rate": 0.0001, "loss": 4.1117, "loss/crossentropy": 1.7113690972328186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20282406359910965, "step": 16524 }, { "epoch": 0.33052, "grad_norm": 1.9296875, "grad_norm_var": 0.012924957275390624, "learning_rate": 0.0001, "loss": 3.9182, "loss/crossentropy": 1.9077526926994324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18814775347709656, "step": 16526 }, { "epoch": 0.33056, "grad_norm": 2.0625, "grad_norm_var": 0.0106597900390625, "learning_rate": 0.0001, "loss": 4.3404, "loss/crossentropy": 1.9100408554077148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1897171437740326, "step": 16528 }, { "epoch": 0.3306, "grad_norm": 1.84375, "grad_norm_var": 0.0066314697265625, "learning_rate": 0.0001, "loss": 4.1403, "loss/crossentropy": 1.8145674467086792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18221604079008102, "step": 16530 }, { "epoch": 0.33064, "grad_norm": 2.1875, "grad_norm_var": 0.0085113525390625, "learning_rate": 0.0001, "loss": 4.0647, "loss/crossentropy": 2.017480492591858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19173581898212433, "step": 16532 }, { "epoch": 0.33068, "grad_norm": 1.9921875, "grad_norm_var": 0.008739980061848958, "learning_rate": 0.0001, "loss": 4.1918, "loss/crossentropy": 1.8937708139419556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974407583475113, "step": 16534 }, { "epoch": 0.33072, "grad_norm": 2.203125, "grad_norm_var": 0.012664540608723959, "learning_rate": 0.0001, "loss": 4.1354, "loss/crossentropy": 1.824431598186493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18114721775054932, "step": 16536 }, { "epoch": 0.33076, "grad_norm": 2.03125, "grad_norm_var": 0.018464152018229166, "learning_rate": 0.0001, "loss": 4.034, "loss/crossentropy": 1.6330693364143372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17686167359352112, "step": 16538 }, { "epoch": 0.3308, "grad_norm": 2.453125, "grad_norm_var": 0.028595987955729166, "learning_rate": 0.0001, "loss": 4.2359, "loss/crossentropy": 2.1948810815811157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21977168321609497, "step": 16540 }, { "epoch": 0.33084, "grad_norm": 2.328125, "grad_norm_var": 0.03220926920572917, "learning_rate": 0.0001, "loss": 4.1942, "loss/crossentropy": 2.1222333908081055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19481626898050308, "step": 16542 }, { "epoch": 0.33088, "grad_norm": 2.140625, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 3.9908, "loss/crossentropy": 1.9703376293182373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20912524312734604, "step": 16544 }, { "epoch": 0.33092, "grad_norm": 1.8515625, "grad_norm_var": 0.03206965128580729, "learning_rate": 0.0001, "loss": 4.0774, "loss/crossentropy": 2.2693361043930054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2137979120016098, "step": 16546 }, { "epoch": 0.33096, "grad_norm": 3.703125, "grad_norm_var": 0.19548746744791667, "learning_rate": 0.0001, "loss": 4.2372, "loss/crossentropy": 2.029613673686981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21167294681072235, "step": 16548 }, { "epoch": 0.331, "grad_norm": 1.90625, "grad_norm_var": 0.19552586873372396, "learning_rate": 0.0001, "loss": 3.7223, "loss/crossentropy": 2.0102853775024414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100009247660637, "step": 16550 }, { "epoch": 0.33104, "grad_norm": 2.234375, "grad_norm_var": 0.19806722005208333, "learning_rate": 0.0001, "loss": 4.6058, "loss/crossentropy": 2.1570287942886353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21619540452957153, "step": 16552 }, { "epoch": 0.33108, "grad_norm": 2.046875, "grad_norm_var": 0.20053609212239584, "learning_rate": 0.0001, "loss": 4.0271, "loss/crossentropy": 1.9157934188842773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193254753947258, "step": 16554 }, { "epoch": 0.33112, "grad_norm": 1.8515625, "grad_norm_var": 0.2052642822265625, "learning_rate": 0.0001, "loss": 3.6593, "loss/crossentropy": 1.7680367827415466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19283122569322586, "step": 16556 }, { "epoch": 0.33116, "grad_norm": 1.8984375, "grad_norm_var": 0.2058013916015625, "learning_rate": 0.0001, "loss": 3.8828, "loss/crossentropy": 1.8805989623069763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20299813151359558, "step": 16558 }, { "epoch": 0.3312, "grad_norm": 1.71875, "grad_norm_var": 0.21938069661458334, "learning_rate": 0.0001, "loss": 3.7657, "loss/crossentropy": 1.9145240187644958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18402665108442307, "step": 16560 }, { "epoch": 0.33124, "grad_norm": 1.9453125, "grad_norm_var": 0.2167144775390625, "learning_rate": 0.0001, "loss": 4.1984, "loss/crossentropy": 1.7958417534828186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17038261890411377, "step": 16562 }, { "epoch": 0.33128, "grad_norm": 1.8984375, "grad_norm_var": 0.038919830322265626, "learning_rate": 0.0001, "loss": 3.8916, "loss/crossentropy": 2.0782148838043213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21738532930612564, "step": 16564 }, { "epoch": 0.33132, "grad_norm": 2.046875, "grad_norm_var": 0.03910903930664063, "learning_rate": 0.0001, "loss": 4.3098, "loss/crossentropy": 1.9767170548439026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127690687775612, "step": 16566 }, { "epoch": 0.33136, "grad_norm": 1.9296875, "grad_norm_var": 0.01185302734375, "learning_rate": 0.0001, "loss": 3.9133, "loss/crossentropy": 2.086383044719696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19811449944972992, "step": 16568 }, { "epoch": 0.3314, "grad_norm": 1.9453125, "grad_norm_var": 0.011991119384765625, "learning_rate": 0.0001, "loss": 4.0358, "loss/crossentropy": 2.0471088886260986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20480120182037354, "step": 16570 }, { "epoch": 0.33144, "grad_norm": 2.109375, "grad_norm_var": 0.012485504150390625, "learning_rate": 0.0001, "loss": 4.1097, "loss/crossentropy": 2.2637689113616943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21112071722745895, "step": 16572 }, { "epoch": 0.33148, "grad_norm": 2.015625, "grad_norm_var": 0.012018839518229166, "learning_rate": 0.0001, "loss": 4.023, "loss/crossentropy": 1.8035425543785095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18357180804014206, "step": 16574 }, { "epoch": 0.33152, "grad_norm": 1.953125, "grad_norm_var": 0.007189687093098958, "learning_rate": 0.0001, "loss": 3.7453, "loss/crossentropy": 1.647614598274231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17512448877096176, "step": 16576 }, { "epoch": 0.33156, "grad_norm": 2.078125, "grad_norm_var": 0.0063873291015625, "learning_rate": 0.0001, "loss": 4.0452, "loss/crossentropy": 2.0647078156471252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21568534523248672, "step": 16578 }, { "epoch": 0.3316, "grad_norm": 1.9296875, "grad_norm_var": 0.00615234375, "learning_rate": 0.0001, "loss": 3.8656, "loss/crossentropy": 1.7758954763412476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891157627105713, "step": 16580 }, { "epoch": 0.33164, "grad_norm": 2.125, "grad_norm_var": 0.0060618082682291664, "learning_rate": 0.0001, "loss": 4.1022, "loss/crossentropy": 2.0523850321769714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21198877692222595, "step": 16582 }, { "epoch": 0.33168, "grad_norm": 2.078125, "grad_norm_var": 0.006725819905598959, "learning_rate": 0.0001, "loss": 4.0584, "loss/crossentropy": 1.99459570646286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20555292069911957, "step": 16584 }, { "epoch": 0.33172, "grad_norm": 2.046875, "grad_norm_var": 0.007771809895833333, "learning_rate": 0.0001, "loss": 4.4979, "loss/crossentropy": 2.353522777557373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23193839937448502, "step": 16586 }, { "epoch": 0.33176, "grad_norm": 2.078125, "grad_norm_var": 0.020970662434895832, "learning_rate": 0.0001, "loss": 4.1303, "loss/crossentropy": 2.0257768630981445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20071294903755188, "step": 16588 }, { "epoch": 0.3318, "grad_norm": 2.046875, "grad_norm_var": 0.020467122395833332, "learning_rate": 0.0001, "loss": 4.2135, "loss/crossentropy": 2.2084985971450806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20876871049404144, "step": 16590 }, { "epoch": 0.33184, "grad_norm": 2.15625, "grad_norm_var": 0.019791412353515624, "learning_rate": 0.0001, "loss": 4.3275, "loss/crossentropy": 1.9118791818618774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24547216296195984, "step": 16592 }, { "epoch": 0.33188, "grad_norm": 1.90625, "grad_norm_var": 0.022188313802083335, "learning_rate": 0.0001, "loss": 4.0069, "loss/crossentropy": 2.184453248977661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1988314613699913, "step": 16594 }, { "epoch": 0.33192, "grad_norm": 1.71875, "grad_norm_var": 0.02801488240559896, "learning_rate": 0.0001, "loss": 3.8711, "loss/crossentropy": 2.2105953097343445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20877376198768616, "step": 16596 }, { "epoch": 0.33196, "grad_norm": 2.078125, "grad_norm_var": 0.028586578369140626, "learning_rate": 0.0001, "loss": 4.2969, "loss/crossentropy": 2.028349459171295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20705370604991913, "step": 16598 }, { "epoch": 0.332, "grad_norm": 2.015625, "grad_norm_var": 0.02982177734375, "learning_rate": 0.0001, "loss": 3.8968, "loss/crossentropy": 2.0778703689575195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19761346280574799, "step": 16600 }, { "epoch": 0.33204, "grad_norm": 1.8828125, "grad_norm_var": 0.029670206705729167, "learning_rate": 0.0001, "loss": 3.9786, "loss/crossentropy": 2.046931743621826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21325716376304626, "step": 16602 }, { "epoch": 0.33208, "grad_norm": 1.9296875, "grad_norm_var": 0.011486562093098958, "learning_rate": 0.0001, "loss": 4.1525, "loss/crossentropy": 1.7907955050468445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822371482849121, "step": 16604 }, { "epoch": 0.33212, "grad_norm": 2.09375, "grad_norm_var": 0.012108357747395833, "learning_rate": 0.0001, "loss": 4.1586, "loss/crossentropy": 1.7620025277137756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19196610152721405, "step": 16606 }, { "epoch": 0.33216, "grad_norm": 2.015625, "grad_norm_var": 0.010282135009765625, "learning_rate": 0.0001, "loss": 3.9175, "loss/crossentropy": 1.6695470213890076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16451346129179, "step": 16608 }, { "epoch": 0.3322, "grad_norm": 2.09375, "grad_norm_var": 0.012717437744140626, "learning_rate": 0.0001, "loss": 4.0304, "loss/crossentropy": 2.2249897718429565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21898073703050613, "step": 16610 }, { "epoch": 0.33224, "grad_norm": 1.8984375, "grad_norm_var": 0.009474436442057291, "learning_rate": 0.0001, "loss": 4.1589, "loss/crossentropy": 2.0861976146698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993977203965187, "step": 16612 }, { "epoch": 0.33228, "grad_norm": 1.859375, "grad_norm_var": 0.009527333577473958, "learning_rate": 0.0001, "loss": 4.1516, "loss/crossentropy": 2.195378541946411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103184312582016, "step": 16614 }, { "epoch": 0.33232, "grad_norm": 1.9609375, "grad_norm_var": 0.009478505452473958, "learning_rate": 0.0001, "loss": 3.6947, "loss/crossentropy": 1.9064915180206299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17310689389705658, "step": 16616 }, { "epoch": 0.33236, "grad_norm": 1.9921875, "grad_norm_var": 0.009452311197916667, "learning_rate": 0.0001, "loss": 3.9698, "loss/crossentropy": 1.7967488169670105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18099892884492874, "step": 16618 }, { "epoch": 0.3324, "grad_norm": 2.140625, "grad_norm_var": 0.012565104166666667, "learning_rate": 0.0001, "loss": 4.1319, "loss/crossentropy": 2.0776742696762085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005431056022644, "step": 16620 }, { "epoch": 0.33244, "grad_norm": 1.90625, "grad_norm_var": 0.0109375, "learning_rate": 0.0001, "loss": 4.1881, "loss/crossentropy": 2.0590518712997437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20676207542419434, "step": 16622 }, { "epoch": 0.33248, "grad_norm": 2.0, "grad_norm_var": 0.011628977457682292, "learning_rate": 0.0001, "loss": 4.2067, "loss/crossentropy": 1.8861089944839478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1860756129026413, "step": 16624 }, { "epoch": 0.33252, "grad_norm": 1.9453125, "grad_norm_var": 0.009308878580729167, "learning_rate": 0.0001, "loss": 3.9757, "loss/crossentropy": 2.4207329750061035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22214040160179138, "step": 16626 }, { "epoch": 0.33256, "grad_norm": 1.8671875, "grad_norm_var": 0.009295399983723958, "learning_rate": 0.0001, "loss": 3.934, "loss/crossentropy": 2.089089274406433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21007104218006134, "step": 16628 }, { "epoch": 0.3326, "grad_norm": 1.984375, "grad_norm_var": 0.011380767822265625, "learning_rate": 0.0001, "loss": 4.3451, "loss/crossentropy": 2.357658624649048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22044895589351654, "step": 16630 }, { "epoch": 0.33264, "grad_norm": 1.875, "grad_norm_var": 0.010223134358723959, "learning_rate": 0.0001, "loss": 3.9142, "loss/crossentropy": 2.1036278009414673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003040835261345, "step": 16632 }, { "epoch": 0.33268, "grad_norm": 1.859375, "grad_norm_var": 0.010749308268229167, "learning_rate": 0.0001, "loss": 4.0802, "loss/crossentropy": 2.202688694000244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21549009531736374, "step": 16634 }, { "epoch": 0.33272, "grad_norm": 2.078125, "grad_norm_var": 0.008642323811848958, "learning_rate": 0.0001, "loss": 4.1642, "loss/crossentropy": 2.261624753475189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2209767997264862, "step": 16636 }, { "epoch": 0.33276, "grad_norm": 2.234375, "grad_norm_var": 0.013570149739583334, "learning_rate": 0.0001, "loss": 4.0556, "loss/crossentropy": 1.7452040910720825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18558355420827866, "step": 16638 }, { "epoch": 0.3328, "grad_norm": 1.9609375, "grad_norm_var": 0.012797037760416666, "learning_rate": 0.0001, "loss": 3.9364, "loss/crossentropy": 1.9910151362419128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115819901227951, "step": 16640 }, { "epoch": 0.33284, "grad_norm": 2.015625, "grad_norm_var": 0.014583079020182292, "learning_rate": 0.0001, "loss": 3.9412, "loss/crossentropy": 2.0230116844177246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001063972711563, "step": 16642 }, { "epoch": 0.33288, "grad_norm": 1.8359375, "grad_norm_var": 0.015818023681640626, "learning_rate": 0.0001, "loss": 4.0476, "loss/crossentropy": 1.9575697183609009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20243427157402039, "step": 16644 }, { "epoch": 0.33292, "grad_norm": 1.96875, "grad_norm_var": 0.012890370686848958, "learning_rate": 0.0001, "loss": 4.052, "loss/crossentropy": 1.9650630354881287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.203716441988945, "step": 16646 }, { "epoch": 0.33296, "grad_norm": 1.859375, "grad_norm_var": 0.0132476806640625, "learning_rate": 0.0001, "loss": 4.1345, "loss/crossentropy": 1.497282326221466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15893913060426712, "step": 16648 }, { "epoch": 0.333, "grad_norm": 1.9765625, "grad_norm_var": 0.014491526285807292, "learning_rate": 0.0001, "loss": 4.1273, "loss/crossentropy": 1.8017843961715698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18981081247329712, "step": 16650 }, { "epoch": 0.33304, "grad_norm": 2.125, "grad_norm_var": 0.014296213785807291, "learning_rate": 0.0001, "loss": 4.07, "loss/crossentropy": 2.089366614818573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076069712638855, "step": 16652 }, { "epoch": 0.33308, "grad_norm": 1.8515625, "grad_norm_var": 0.008695475260416667, "learning_rate": 0.0001, "loss": 3.9662, "loss/crossentropy": 1.9445012211799622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18241490423679352, "step": 16654 }, { "epoch": 0.33312, "grad_norm": 1.90625, "grad_norm_var": 0.008740234375, "learning_rate": 0.0001, "loss": 3.9922, "loss/crossentropy": 1.844423532485962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18664997816085815, "step": 16656 }, { "epoch": 0.33316, "grad_norm": 1.9765625, "grad_norm_var": 0.007085927327473958, "learning_rate": 0.0001, "loss": 4.2221, "loss/crossentropy": 2.1095730662345886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21053151786327362, "step": 16658 }, { "epoch": 0.3332, "grad_norm": 2.03125, "grad_norm_var": 0.007477823893229167, "learning_rate": 0.0001, "loss": 4.255, "loss/crossentropy": 2.139304041862488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19131766259670258, "step": 16660 }, { "epoch": 0.33324, "grad_norm": 2.0, "grad_norm_var": 0.007450103759765625, "learning_rate": 0.0001, "loss": 3.9358, "loss/crossentropy": 1.8872849345207214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19200573861598969, "step": 16662 }, { "epoch": 0.33328, "grad_norm": 1.984375, "grad_norm_var": 0.006695302327473959, "learning_rate": 0.0001, "loss": 4.0939, "loss/crossentropy": 2.488931655883789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24411997199058533, "step": 16664 }, { "epoch": 0.33332, "grad_norm": 1.9609375, "grad_norm_var": 0.006151326497395833, "learning_rate": 0.0001, "loss": 3.7294, "loss/crossentropy": 1.8066997528076172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19391655176877975, "step": 16666 }, { "epoch": 0.33336, "grad_norm": 2.0, "grad_norm_var": 0.005370076497395833, "learning_rate": 0.0001, "loss": 4.1831, "loss/crossentropy": 2.085490345954895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21987342834472656, "step": 16668 }, { "epoch": 0.3334, "grad_norm": 1.9765625, "grad_norm_var": 0.006459299723307292, "learning_rate": 0.0001, "loss": 3.7791, "loss/crossentropy": 2.052145302295685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21673783659934998, "step": 16670 }, { "epoch": 0.33344, "grad_norm": 1.9140625, "grad_norm_var": 0.00640869140625, "learning_rate": 0.0001, "loss": 4.1648, "loss/crossentropy": 2.0924419164657593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21523578464984894, "step": 16672 }, { "epoch": 0.33348, "grad_norm": 1.96875, "grad_norm_var": 0.006525675455729167, "learning_rate": 0.0001, "loss": 4.125, "loss/crossentropy": 2.0340868830680847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1785074919462204, "step": 16674 }, { "epoch": 0.33352, "grad_norm": 2.25, "grad_norm_var": 0.010990397135416666, "learning_rate": 0.0001, "loss": 4.2404, "loss/crossentropy": 2.4421908855438232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24263421446084976, "step": 16676 }, { "epoch": 0.33356, "grad_norm": 1.953125, "grad_norm_var": 0.0108551025390625, "learning_rate": 0.0001, "loss": 4.1879, "loss/crossentropy": 1.9141955971717834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18666712939739227, "step": 16678 }, { "epoch": 0.3336, "grad_norm": 2.015625, "grad_norm_var": 0.011336008707682291, "learning_rate": 0.0001, "loss": 4.0378, "loss/crossentropy": 2.0019638538360596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17722105979919434, "step": 16680 }, { "epoch": 0.33364, "grad_norm": 1.984375, "grad_norm_var": 0.009511057535807292, "learning_rate": 0.0001, "loss": 4.2047, "loss/crossentropy": 2.0835896134376526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.251879021525383, "step": 16682 }, { "epoch": 0.33368, "grad_norm": 1.9609375, "grad_norm_var": 0.015192667643229166, "learning_rate": 0.0001, "loss": 4.2406, "loss/crossentropy": 1.8940032720565796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19034714996814728, "step": 16684 }, { "epoch": 0.33372, "grad_norm": 2.09375, "grad_norm_var": 0.014817047119140624, "learning_rate": 0.0001, "loss": 4.0845, "loss/crossentropy": 1.863099992275238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19675587117671967, "step": 16686 }, { "epoch": 0.33376, "grad_norm": 2.046875, "grad_norm_var": 0.024836222330729168, "learning_rate": 0.0001, "loss": 4.2785, "loss/crossentropy": 2.027459740638733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19650273770093918, "step": 16688 }, { "epoch": 0.3338, "grad_norm": 1.890625, "grad_norm_var": 0.026008097330729167, "learning_rate": 0.0001, "loss": 4.2892, "loss/crossentropy": 2.2324944734573364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23016268759965897, "step": 16690 }, { "epoch": 0.33384, "grad_norm": 2.03125, "grad_norm_var": 0.02474950154622396, "learning_rate": 0.0001, "loss": 4.155, "loss/crossentropy": 2.213741898536682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21159538626670837, "step": 16692 }, { "epoch": 0.33388, "grad_norm": 2.03125, "grad_norm_var": 0.025340779622395834, "learning_rate": 0.0001, "loss": 4.4426, "loss/crossentropy": 2.084823966026306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22060541808605194, "step": 16694 }, { "epoch": 0.33392, "grad_norm": 1.9765625, "grad_norm_var": 0.02417780558268229, "learning_rate": 0.0001, "loss": 4.1847, "loss/crossentropy": 2.2417017221450806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2050381377339363, "step": 16696 }, { "epoch": 0.33396, "grad_norm": 2.015625, "grad_norm_var": 0.032134755452473955, "learning_rate": 0.0001, "loss": 4.3037, "loss/crossentropy": 2.3696242570877075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22559890151023865, "step": 16698 }, { "epoch": 0.334, "grad_norm": 1.8828125, "grad_norm_var": 0.030295562744140626, "learning_rate": 0.0001, "loss": 4.158, "loss/crossentropy": 2.0899609327316284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20470503717660904, "step": 16700 }, { "epoch": 0.33404, "grad_norm": 1.90625, "grad_norm_var": 0.029386138916015624, "learning_rate": 0.0001, "loss": 3.924, "loss/crossentropy": 1.8663234114646912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1815531924366951, "step": 16702 }, { "epoch": 0.33408, "grad_norm": 3.0, "grad_norm_var": 0.08069432576497396, "learning_rate": 0.0001, "loss": 4.1651, "loss/crossentropy": 2.030432403087616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063748836517334, "step": 16704 }, { "epoch": 0.33412, "grad_norm": 1.90625, "grad_norm_var": 0.08033218383789062, "learning_rate": 0.0001, "loss": 4.152, "loss/crossentropy": 2.12885981798172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19536730647087097, "step": 16706 }, { "epoch": 0.33416, "grad_norm": 2.09375, "grad_norm_var": 0.07674153645833333, "learning_rate": 0.0001, "loss": 3.909, "loss/crossentropy": 2.0870607495307922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20478558540344238, "step": 16708 }, { "epoch": 0.3342, "grad_norm": 2.03125, "grad_norm_var": 0.07646382649739583, "learning_rate": 0.0001, "loss": 4.2058, "loss/crossentropy": 2.215463638305664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118721902370453, "step": 16710 }, { "epoch": 0.33424, "grad_norm": 1.9609375, "grad_norm_var": 0.08145243326822917, "learning_rate": 0.0001, "loss": 3.8417, "loss/crossentropy": 2.0292049646377563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18440847843885422, "step": 16712 }, { "epoch": 0.33428, "grad_norm": 1.828125, "grad_norm_var": 0.08017756144205729, "learning_rate": 0.0001, "loss": 3.7722, "loss/crossentropy": 1.9214341640472412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18510671705007553, "step": 16714 }, { "epoch": 0.33432, "grad_norm": 2.046875, "grad_norm_var": 0.07769266764322917, "learning_rate": 0.0001, "loss": 4.3905, "loss/crossentropy": 2.1761534214019775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20527766644954681, "step": 16716 }, { "epoch": 0.33436, "grad_norm": 1.9765625, "grad_norm_var": 0.07639567057291667, "learning_rate": 0.0001, "loss": 3.9234, "loss/crossentropy": 2.020824670791626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20723462104797363, "step": 16718 }, { "epoch": 0.3344, "grad_norm": 1.921875, "grad_norm_var": 0.0129547119140625, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 2.243465781211853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21866197139024734, "step": 16720 }, { "epoch": 0.33444, "grad_norm": 1.859375, "grad_norm_var": 0.013244374593098959, "learning_rate": 0.0001, "loss": 3.9963, "loss/crossentropy": 1.9071390628814697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20368033647537231, "step": 16722 }, { "epoch": 0.33448, "grad_norm": 1.7890625, "grad_norm_var": 0.011966959635416666, "learning_rate": 0.0001, "loss": 3.8974, "loss/crossentropy": 1.72525554895401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18200145661830902, "step": 16724 }, { "epoch": 0.33452, "grad_norm": 2.109375, "grad_norm_var": 0.0154052734375, "learning_rate": 0.0001, "loss": 4.101, "loss/crossentropy": 2.174055576324463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181844174861908, "step": 16726 }, { "epoch": 0.33456, "grad_norm": 1.8984375, "grad_norm_var": 0.014251454671223959, "learning_rate": 0.0001, "loss": 3.8004, "loss/crossentropy": 1.6175724864006042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15338046848773956, "step": 16728 }, { "epoch": 0.3346, "grad_norm": 1.953125, "grad_norm_var": 0.012353515625, "learning_rate": 0.0001, "loss": 4.1101, "loss/crossentropy": 2.0030194520950317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20353461802005768, "step": 16730 }, { "epoch": 0.33464, "grad_norm": 2.03125, "grad_norm_var": 0.012092081705729167, "learning_rate": 0.0001, "loss": 4.3876, "loss/crossentropy": 2.272015690803528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22368235886096954, "step": 16732 }, { "epoch": 0.33468, "grad_norm": 2.046875, "grad_norm_var": 0.012189737955729167, "learning_rate": 0.0001, "loss": 4.3365, "loss/crossentropy": 2.149975538253784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20634270459413528, "step": 16734 }, { "epoch": 0.33472, "grad_norm": 1.921875, "grad_norm_var": 0.009869130452473958, "learning_rate": 0.0001, "loss": 3.9201, "loss/crossentropy": 2.095462441444397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20027077198028564, "step": 16736 }, { "epoch": 0.33476, "grad_norm": 1.953125, "grad_norm_var": 0.008819325764973959, "learning_rate": 0.0001, "loss": 4.0859, "loss/crossentropy": 1.988598644733429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20503661036491394, "step": 16738 }, { "epoch": 0.3348, "grad_norm": 2.03125, "grad_norm_var": 0.0066569010416666664, "learning_rate": 0.0001, "loss": 4.2618, "loss/crossentropy": 2.0615572333335876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20363956689834595, "step": 16740 }, { "epoch": 0.33484, "grad_norm": 2.0625, "grad_norm_var": 0.0027903238932291668, "learning_rate": 0.0001, "loss": 3.9476, "loss/crossentropy": 1.8172455430030823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20091129839420319, "step": 16742 }, { "epoch": 0.33488, "grad_norm": 1.984375, "grad_norm_var": 0.0043853759765625, "learning_rate": 0.0001, "loss": 3.9541, "loss/crossentropy": 2.212782144546509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19710610061883926, "step": 16744 }, { "epoch": 0.33492, "grad_norm": 1.9453125, "grad_norm_var": 0.008674875895182291, "learning_rate": 0.0001, "loss": 4.2629, "loss/crossentropy": 2.374183773994446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23687051236629486, "step": 16746 }, { "epoch": 0.33496, "grad_norm": 2.0625, "grad_norm_var": 0.008902740478515626, "learning_rate": 0.0001, "loss": 4.3642, "loss/crossentropy": 1.9374622702598572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19821622967720032, "step": 16748 }, { "epoch": 0.335, "grad_norm": 1.984375, "grad_norm_var": 0.008740234375, "learning_rate": 0.0001, "loss": 4.1289, "loss/crossentropy": 1.6063715815544128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16296840459108353, "step": 16750 }, { "epoch": 0.33504, "grad_norm": 1.859375, "grad_norm_var": 0.0096832275390625, "learning_rate": 0.0001, "loss": 4.0315, "loss/crossentropy": 2.1251469254493713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17997874319553375, "step": 16752 }, { "epoch": 0.33508, "grad_norm": 1.9140625, "grad_norm_var": 0.0097564697265625, "learning_rate": 0.0001, "loss": 4.03, "loss/crossentropy": 2.126043915748596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2283206582069397, "step": 16754 }, { "epoch": 0.33512, "grad_norm": 2.046875, "grad_norm_var": 0.009901682535807291, "learning_rate": 0.0001, "loss": 4.2969, "loss/crossentropy": 2.3409340381622314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21817704290151596, "step": 16756 }, { "epoch": 0.33516, "grad_norm": 2.0, "grad_norm_var": 0.01024169921875, "learning_rate": 0.0001, "loss": 4.0083, "loss/crossentropy": 1.9816790223121643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19743956625461578, "step": 16758 }, { "epoch": 0.3352, "grad_norm": 2.21875, "grad_norm_var": 0.012967681884765625, "learning_rate": 0.0001, "loss": 4.3056, "loss/crossentropy": 2.246092438697815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22662658989429474, "step": 16760 }, { "epoch": 0.33524, "grad_norm": 2.125, "grad_norm_var": 0.010794830322265626, "learning_rate": 0.0001, "loss": 4.2842, "loss/crossentropy": 2.1571802496910095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20506682991981506, "step": 16762 }, { "epoch": 0.33528, "grad_norm": 2.015625, "grad_norm_var": 0.010636138916015624, "learning_rate": 0.0001, "loss": 4.0372, "loss/crossentropy": 1.966954231262207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18581678718328476, "step": 16764 }, { "epoch": 0.33532, "grad_norm": 2.125, "grad_norm_var": 0.012961578369140626, "learning_rate": 0.0001, "loss": 3.9978, "loss/crossentropy": 1.9360153079032898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20846693962812424, "step": 16766 }, { "epoch": 0.33536, "grad_norm": 2.0625, "grad_norm_var": 0.012644195556640625, "learning_rate": 0.0001, "loss": 4.0953, "loss/crossentropy": 2.3978073596954346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20406542718410492, "step": 16768 }, { "epoch": 0.3354, "grad_norm": 2.109375, "grad_norm_var": 0.013639068603515625, "learning_rate": 0.0001, "loss": 4.0933, "loss/crossentropy": 2.239229917526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21536701172590256, "step": 16770 }, { "epoch": 0.33544, "grad_norm": 1.9765625, "grad_norm_var": 0.013600413004557292, "learning_rate": 0.0001, "loss": 4.2153, "loss/crossentropy": 2.1333428621292114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062046378850937, "step": 16772 }, { "epoch": 0.33548, "grad_norm": 2.09375, "grad_norm_var": 0.014362589518229166, "learning_rate": 0.0001, "loss": 4.0094, "loss/crossentropy": 2.3503127098083496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20422626286745071, "step": 16774 }, { "epoch": 0.33552, "grad_norm": 1.9140625, "grad_norm_var": 0.009772745768229167, "learning_rate": 0.0001, "loss": 4.1466, "loss/crossentropy": 1.9181209802627563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16844520717859268, "step": 16776 }, { "epoch": 0.33556, "grad_norm": 2.015625, "grad_norm_var": 0.009269205729166667, "learning_rate": 0.0001, "loss": 4.1975, "loss/crossentropy": 2.022172212600708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19443488866090775, "step": 16778 }, { "epoch": 0.3356, "grad_norm": 1.9765625, "grad_norm_var": 0.010951487223307292, "learning_rate": 0.0001, "loss": 3.9544, "loss/crossentropy": 2.0416210293769836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17731749266386032, "step": 16780 }, { "epoch": 0.33564, "grad_norm": 2.0625, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 4.0757, "loss/crossentropy": 2.1722596883773804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19995421171188354, "step": 16782 }, { "epoch": 0.33568, "grad_norm": 1.953125, "grad_norm_var": 0.009146881103515626, "learning_rate": 0.0001, "loss": 4.0942, "loss/crossentropy": 2.4041545391082764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22784388065338135, "step": 16784 }, { "epoch": 0.33572, "grad_norm": 1.9453125, "grad_norm_var": 0.007757314046223958, "learning_rate": 0.0001, "loss": 4.0692, "loss/crossentropy": 2.189204216003418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20079431682825089, "step": 16786 }, { "epoch": 0.33576, "grad_norm": 1.890625, "grad_norm_var": 0.007130686442057292, "learning_rate": 0.0001, "loss": 3.9771, "loss/crossentropy": 2.2308130860328674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22352158278226852, "step": 16788 }, { "epoch": 0.3358, "grad_norm": 1.9765625, "grad_norm_var": 0.005248006184895833, "learning_rate": 0.0001, "loss": 4.0901, "loss/crossentropy": 2.238118886947632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21165399253368378, "step": 16790 }, { "epoch": 0.33584, "grad_norm": 11.9375, "grad_norm_var": 6.248060862223308, "learning_rate": 0.0001, "loss": 4.5633, "loss/crossentropy": 2.452346444129944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192159816622734, "step": 16792 }, { "epoch": 0.33588, "grad_norm": 1.9765625, "grad_norm_var": 6.234175364176433, "learning_rate": 0.0001, "loss": 4.2032, "loss/crossentropy": 2.115506410598755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19228096306324005, "step": 16794 }, { "epoch": 0.33592, "grad_norm": 2.046875, "grad_norm_var": 6.207706705729167, "learning_rate": 0.0001, "loss": 4.313, "loss/crossentropy": 2.1052395701408386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24314512312412262, "step": 16796 }, { "epoch": 0.33596, "grad_norm": 1.9765625, "grad_norm_var": 6.191576131184896, "learning_rate": 0.0001, "loss": 4.201, "loss/crossentropy": 1.7643597722053528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17998731136322021, "step": 16798 }, { "epoch": 0.336, "grad_norm": 1.890625, "grad_norm_var": 6.213846588134766, "learning_rate": 0.0001, "loss": 3.886, "loss/crossentropy": 2.0926302671432495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.199835367500782, "step": 16800 }, { "epoch": 0.33604, "grad_norm": 2.109375, "grad_norm_var": 6.19762954711914, "learning_rate": 0.0001, "loss": 4.0106, "loss/crossentropy": 1.9939849972724915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19728610664606094, "step": 16802 }, { "epoch": 0.33608, "grad_norm": 2.15625, "grad_norm_var": 6.1746826171875, "learning_rate": 0.0001, "loss": 4.3381, "loss/crossentropy": 2.154136300086975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071739211678505, "step": 16804 }, { "epoch": 0.33612, "grad_norm": 1.9296875, "grad_norm_var": 6.173281860351563, "learning_rate": 0.0001, "loss": 4.06, "loss/crossentropy": 1.6244451403617859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1825866624712944, "step": 16806 }, { "epoch": 0.33616, "grad_norm": 1.890625, "grad_norm_var": 0.010945383707682292, "learning_rate": 0.0001, "loss": 3.6796, "loss/crossentropy": 1.936405599117279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20031629502773285, "step": 16808 }, { "epoch": 0.3362, "grad_norm": 2.046875, "grad_norm_var": 0.011568196614583333, "learning_rate": 0.0001, "loss": 4.276, "loss/crossentropy": 2.117881119251251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21941083669662476, "step": 16810 }, { "epoch": 0.33624, "grad_norm": 1.921875, "grad_norm_var": 0.0125152587890625, "learning_rate": 0.0001, "loss": 4.1104, "loss/crossentropy": 2.0384327173233032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019578069448471, "step": 16812 }, { "epoch": 0.33628, "grad_norm": 1.921875, "grad_norm_var": 0.011893463134765626, "learning_rate": 0.0001, "loss": 4.1383, "loss/crossentropy": 2.058988571166992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20959977805614471, "step": 16814 }, { "epoch": 0.33632, "grad_norm": 1.9296875, "grad_norm_var": 0.010680898030598959, "learning_rate": 0.0001, "loss": 4.0734, "loss/crossentropy": 2.117241382598877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039024829864502, "step": 16816 }, { "epoch": 0.33636, "grad_norm": 2.0, "grad_norm_var": 0.009968058268229166, "learning_rate": 0.0001, "loss": 4.157, "loss/crossentropy": 1.8142318725585938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877349317073822, "step": 16818 }, { "epoch": 0.3364, "grad_norm": 2.296875, "grad_norm_var": 0.014440663655598958, "learning_rate": 0.0001, "loss": 4.2402, "loss/crossentropy": 2.1766676902770996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21788865327835083, "step": 16820 }, { "epoch": 0.33644, "grad_norm": 2.078125, "grad_norm_var": 0.0163726806640625, "learning_rate": 0.0001, "loss": 4.2953, "loss/crossentropy": 2.2596739530563354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23641519993543625, "step": 16822 }, { "epoch": 0.33648, "grad_norm": 1.984375, "grad_norm_var": 0.012572987874348959, "learning_rate": 0.0001, "loss": 4.1689, "loss/crossentropy": 2.1782987117767334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20562626421451569, "step": 16824 }, { "epoch": 0.33652, "grad_norm": 2.28125, "grad_norm_var": 0.01633275349934896, "learning_rate": 0.0001, "loss": 4.0693, "loss/crossentropy": 1.9579410552978516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20332375168800354, "step": 16826 }, { "epoch": 0.33656, "grad_norm": 1.75, "grad_norm_var": 0.020977528889973958, "learning_rate": 0.0001, "loss": 4.0193, "loss/crossentropy": 1.917382538318634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188996322453022, "step": 16828 }, { "epoch": 0.3366, "grad_norm": 2.125, "grad_norm_var": 0.02045466105143229, "learning_rate": 0.0001, "loss": 3.9885, "loss/crossentropy": 1.8045424222946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18587132543325424, "step": 16830 }, { "epoch": 0.33664, "grad_norm": 2.046875, "grad_norm_var": 0.07043355305989583, "learning_rate": 0.0001, "loss": 4.0171, "loss/crossentropy": 2.113844871520996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20383527874946594, "step": 16832 }, { "epoch": 0.33668, "grad_norm": 2.0625, "grad_norm_var": 0.06795654296875, "learning_rate": 0.0001, "loss": 4.1255, "loss/crossentropy": 1.9635592699050903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20447230339050293, "step": 16834 }, { "epoch": 0.33672, "grad_norm": 2.046875, "grad_norm_var": 0.06646219889322917, "learning_rate": 0.0001, "loss": 4.1336, "loss/crossentropy": 1.8918602466583252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21786046028137207, "step": 16836 }, { "epoch": 0.33676, "grad_norm": 1.9453125, "grad_norm_var": 0.06734390258789062, "learning_rate": 0.0001, "loss": 4.3372, "loss/crossentropy": 2.347909450531006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23047995567321777, "step": 16838 }, { "epoch": 0.3368, "grad_norm": 1.90625, "grad_norm_var": 0.06854248046875, "learning_rate": 0.0001, "loss": 4.0242, "loss/crossentropy": 1.834633469581604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915532797574997, "step": 16840 }, { "epoch": 0.33684, "grad_norm": 1.84375, "grad_norm_var": 0.06948954264322917, "learning_rate": 0.0001, "loss": 3.6871, "loss/crossentropy": 1.9172112345695496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18901804834604263, "step": 16842 }, { "epoch": 0.33688, "grad_norm": 1.953125, "grad_norm_var": 0.06441141764322916, "learning_rate": 0.0001, "loss": 4.3911, "loss/crossentropy": 2.37927508354187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2328021451830864, "step": 16844 }, { "epoch": 0.33692, "grad_norm": 1.921875, "grad_norm_var": 0.06558024088541667, "learning_rate": 0.0001, "loss": 3.8573, "loss/crossentropy": 1.7887099385261536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17989587783813477, "step": 16846 }, { "epoch": 0.33696, "grad_norm": 1.96875, "grad_norm_var": 0.0051513671875, "learning_rate": 0.0001, "loss": 4.0826, "loss/crossentropy": 2.008473217487335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20677915215492249, "step": 16848 }, { "epoch": 0.337, "grad_norm": 1.90625, "grad_norm_var": 0.005362955729166666, "learning_rate": 0.0001, "loss": 4.1751, "loss/crossentropy": 2.101797103881836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19833290576934814, "step": 16850 }, { "epoch": 0.33704, "grad_norm": 2.0, "grad_norm_var": 0.005779774983723959, "learning_rate": 0.0001, "loss": 4.0087, "loss/crossentropy": 1.9306662678718567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893247440457344, "step": 16852 }, { "epoch": 0.33708, "grad_norm": 1.9921875, "grad_norm_var": 0.005293782552083333, "learning_rate": 0.0001, "loss": 3.9986, "loss/crossentropy": 2.1301704049110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975892335176468, "step": 16854 }, { "epoch": 0.33712, "grad_norm": 2.09375, "grad_norm_var": 0.007173411051432292, "learning_rate": 0.0001, "loss": 3.9583, "loss/crossentropy": 2.020972192287445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19376973807811737, "step": 16856 }, { "epoch": 0.33716, "grad_norm": 1.9921875, "grad_norm_var": 0.006159464518229167, "learning_rate": 0.0001, "loss": 3.9632, "loss/crossentropy": 1.8190429210662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1629476398229599, "step": 16858 }, { "epoch": 0.3372, "grad_norm": 2.09375, "grad_norm_var": 0.007183583577473959, "learning_rate": 0.0001, "loss": 4.2147, "loss/crossentropy": 2.360092878341675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22874057292938232, "step": 16860 }, { "epoch": 0.33724, "grad_norm": 2.171875, "grad_norm_var": 0.008853912353515625, "learning_rate": 0.0001, "loss": 4.232, "loss/crossentropy": 2.3477792739868164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215813398361206, "step": 16862 }, { "epoch": 0.33728, "grad_norm": 1.8828125, "grad_norm_var": 0.009919230143229167, "learning_rate": 0.0001, "loss": 4.2551, "loss/crossentropy": 2.474452257156372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2185184359550476, "step": 16864 }, { "epoch": 0.33732, "grad_norm": 2.015625, "grad_norm_var": 0.010396067301432292, "learning_rate": 0.0001, "loss": 3.7782, "loss/crossentropy": 1.8387269973754883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18137617409229279, "step": 16866 }, { "epoch": 0.33736, "grad_norm": 1.953125, "grad_norm_var": 0.009924062093098958, "learning_rate": 0.0001, "loss": 3.9414, "loss/crossentropy": 1.8537965416908264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19480426609516144, "step": 16868 }, { "epoch": 0.3374, "grad_norm": 1.96875, "grad_norm_var": 0.010057576497395833, "learning_rate": 0.0001, "loss": 3.9808, "loss/crossentropy": 1.9274648427963257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19545693695545197, "step": 16870 }, { "epoch": 0.33744, "grad_norm": 2.140625, "grad_norm_var": 0.010273996988932292, "learning_rate": 0.0001, "loss": 4.1097, "loss/crossentropy": 1.9180024862289429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180539146065712, "step": 16872 }, { "epoch": 0.33748, "grad_norm": 2.078125, "grad_norm_var": 0.011016591389973959, "learning_rate": 0.0001, "loss": 4.4008, "loss/crossentropy": 2.185365915298462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2225860357284546, "step": 16874 }, { "epoch": 0.33752, "grad_norm": 1.859375, "grad_norm_var": 0.011523183186848958, "learning_rate": 0.0001, "loss": 3.9882, "loss/crossentropy": 1.9749634861946106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19897626340389252, "step": 16876 }, { "epoch": 0.33756, "grad_norm": 2.109375, "grad_norm_var": 0.010284169514973959, "learning_rate": 0.0001, "loss": 4.237, "loss/crossentropy": 2.1505807638168335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20921844244003296, "step": 16878 }, { "epoch": 0.3376, "grad_norm": 1.859375, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.0692, "loss/crossentropy": 1.7159577012062073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19649318605661392, "step": 16880 }, { "epoch": 0.33764, "grad_norm": 1.84375, "grad_norm_var": 0.010251617431640625, "learning_rate": 0.0001, "loss": 4.1615, "loss/crossentropy": 2.187040388584137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20251552015542984, "step": 16882 }, { "epoch": 0.33768, "grad_norm": 2.234375, "grad_norm_var": 0.014609527587890626, "learning_rate": 0.0001, "loss": 4.0587, "loss/crossentropy": 1.9247627258300781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17710037529468536, "step": 16884 }, { "epoch": 0.33772, "grad_norm": 1.9375, "grad_norm_var": 0.0141845703125, "learning_rate": 0.0001, "loss": 4.001, "loss/crossentropy": 1.8499276041984558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18950388580560684, "step": 16886 }, { "epoch": 0.33776, "grad_norm": 1.8828125, "grad_norm_var": 0.0138824462890625, "learning_rate": 0.0001, "loss": 4.0998, "loss/crossentropy": 2.0127341747283936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18379998207092285, "step": 16888 }, { "epoch": 0.3378, "grad_norm": 1.9375, "grad_norm_var": 0.013602447509765626, "learning_rate": 0.0001, "loss": 3.9096, "loss/crossentropy": 2.2561115026474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20785757154226303, "step": 16890 }, { "epoch": 0.33784, "grad_norm": 1.9375, "grad_norm_var": 0.0131103515625, "learning_rate": 0.0001, "loss": 3.9638, "loss/crossentropy": 1.9391398429870605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19481037557125092, "step": 16892 }, { "epoch": 0.33788, "grad_norm": 1.8984375, "grad_norm_var": 0.014827219645182292, "learning_rate": 0.0001, "loss": 4.2368, "loss/crossentropy": 2.3129481077194214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2065483033657074, "step": 16894 }, { "epoch": 0.33792, "grad_norm": 2.296875, "grad_norm_var": 0.01984430948893229, "learning_rate": 0.0001, "loss": 4.0774, "loss/crossentropy": 1.8482372760772705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19291523844003677, "step": 16896 }, { "epoch": 0.33796, "grad_norm": 2.03125, "grad_norm_var": 0.020031483968098958, "learning_rate": 0.0001, "loss": 4.3876, "loss/crossentropy": 1.973912537097931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22137422114610672, "step": 16898 }, { "epoch": 0.338, "grad_norm": 1.890625, "grad_norm_var": 0.01654841105143229, "learning_rate": 0.0001, "loss": 3.9599, "loss/crossentropy": 1.7987132668495178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18385548144578934, "step": 16900 }, { "epoch": 0.33804, "grad_norm": 1.890625, "grad_norm_var": 0.01727879842122396, "learning_rate": 0.0001, "loss": 4.3673, "loss/crossentropy": 2.199060797691345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016109600663185, "step": 16902 }, { "epoch": 0.33808, "grad_norm": 3.046875, "grad_norm_var": 0.0830718994140625, "learning_rate": 0.0001, "loss": 4.27, "loss/crossentropy": 2.179081439971924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21342043578624725, "step": 16904 }, { "epoch": 0.33812, "grad_norm": 1.9375, "grad_norm_var": 0.0810455322265625, "learning_rate": 0.0001, "loss": 3.9861, "loss/crossentropy": 1.953968107700348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20462830364704132, "step": 16906 }, { "epoch": 0.33816, "grad_norm": 2.0, "grad_norm_var": 0.08123270670572917, "learning_rate": 0.0001, "loss": 4.3432, "loss/crossentropy": 2.3202494382858276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21390480548143387, "step": 16908 }, { "epoch": 0.3382, "grad_norm": 2.15625, "grad_norm_var": 0.07872899373372395, "learning_rate": 0.0001, "loss": 4.3085, "loss/crossentropy": 2.085767686367035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20946675539016724, "step": 16910 }, { "epoch": 0.33824, "grad_norm": 2.0625, "grad_norm_var": 0.07372945149739583, "learning_rate": 0.0001, "loss": 4.217, "loss/crossentropy": 1.789705514907837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2247294932603836, "step": 16912 }, { "epoch": 0.33828, "grad_norm": 1.9140625, "grad_norm_var": 0.07440770467122396, "learning_rate": 0.0001, "loss": 4.0409, "loss/crossentropy": 1.9354140758514404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17958863079547882, "step": 16914 }, { "epoch": 0.33832, "grad_norm": 2.015625, "grad_norm_var": 0.07529296875, "learning_rate": 0.0001, "loss": 3.6949, "loss/crossentropy": 1.878045916557312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1812344640493393, "step": 16916 }, { "epoch": 0.33836, "grad_norm": 1.8359375, "grad_norm_var": 0.0769488016764323, "learning_rate": 0.0001, "loss": 3.9392, "loss/crossentropy": 1.7474132776260376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19444099068641663, "step": 16918 }, { "epoch": 0.3384, "grad_norm": 1.9453125, "grad_norm_var": 0.007185618082682292, "learning_rate": 0.0001, "loss": 4.1524, "loss/crossentropy": 2.0462751984596252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20077970623970032, "step": 16920 }, { "epoch": 0.33844, "grad_norm": 2.03125, "grad_norm_var": 0.006912994384765625, "learning_rate": 0.0001, "loss": 4.205, "loss/crossentropy": 2.1104917526245117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21799270808696747, "step": 16922 }, { "epoch": 0.33848, "grad_norm": 2.0625, "grad_norm_var": 0.007083892822265625, "learning_rate": 0.0001, "loss": 4.1915, "loss/crossentropy": 2.11525696516037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20585542917251587, "step": 16924 }, { "epoch": 0.33852, "grad_norm": 1.96875, "grad_norm_var": 0.005222320556640625, "learning_rate": 0.0001, "loss": 4.3629, "loss/crossentropy": 2.3451485633850098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22672247886657715, "step": 16926 }, { "epoch": 0.33856, "grad_norm": 2.03125, "grad_norm_var": 0.004906209309895834, "learning_rate": 0.0001, "loss": 4.0493, "loss/crossentropy": 1.8872862458229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973365843296051, "step": 16928 }, { "epoch": 0.3386, "grad_norm": 2.1875, "grad_norm_var": 0.00777587890625, "learning_rate": 0.0001, "loss": 4.1082, "loss/crossentropy": 1.9465213418006897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2297648787498474, "step": 16930 }, { "epoch": 0.33864, "grad_norm": 1.96875, "grad_norm_var": 0.0084625244140625, "learning_rate": 0.0001, "loss": 3.9397, "loss/crossentropy": 1.7463516592979431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193784698843956, "step": 16932 }, { "epoch": 0.33868, "grad_norm": 1.984375, "grad_norm_var": 0.006738026936848958, "learning_rate": 0.0001, "loss": 4.1561, "loss/crossentropy": 2.2214877605438232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064037024974823, "step": 16934 }, { "epoch": 0.33872, "grad_norm": 1.9765625, "grad_norm_var": 0.007600657145182292, "learning_rate": 0.0001, "loss": 4.0355, "loss/crossentropy": 1.896401822566986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21860874444246292, "step": 16936 }, { "epoch": 0.33876, "grad_norm": 1.9609375, "grad_norm_var": 0.007209269205729166, "learning_rate": 0.0001, "loss": 4.0701, "loss/crossentropy": 2.508669376373291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23076221346855164, "step": 16938 }, { "epoch": 0.3388, "grad_norm": 2.09375, "grad_norm_var": 0.009266916910807292, "learning_rate": 0.0001, "loss": 3.8421, "loss/crossentropy": 1.9412622451782227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1833793818950653, "step": 16940 }, { "epoch": 0.33884, "grad_norm": 2.484375, "grad_norm_var": 0.025406646728515624, "learning_rate": 0.0001, "loss": 4.362, "loss/crossentropy": 2.4757901430130005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23629899322986603, "step": 16942 }, { "epoch": 0.33888, "grad_norm": 2.046875, "grad_norm_var": 0.02580744425455729, "learning_rate": 0.0001, "loss": 4.1204, "loss/crossentropy": 1.788071870803833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894787922501564, "step": 16944 }, { "epoch": 0.33892, "grad_norm": 1.9296875, "grad_norm_var": 0.023436482747395834, "learning_rate": 0.0001, "loss": 4.0626, "loss/crossentropy": 2.1902449131011963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21310994774103165, "step": 16946 }, { "epoch": 0.33896, "grad_norm": 1.96875, "grad_norm_var": 0.021394856770833335, "learning_rate": 0.0001, "loss": 3.9828, "loss/crossentropy": 2.0444132685661316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1755325198173523, "step": 16948 }, { "epoch": 0.339, "grad_norm": 2.09375, "grad_norm_var": 0.02200902303059896, "learning_rate": 0.0001, "loss": 4.1609, "loss/crossentropy": 1.9242961406707764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18970132619142532, "step": 16950 }, { "epoch": 0.33904, "grad_norm": 1.90625, "grad_norm_var": 0.0216552734375, "learning_rate": 0.0001, "loss": 3.9819, "loss/crossentropy": 1.7046860456466675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19807563722133636, "step": 16952 }, { "epoch": 0.33908, "grad_norm": 2.0625, "grad_norm_var": 0.02184015909830729, "learning_rate": 0.0001, "loss": 3.8975, "loss/crossentropy": 1.8960286974906921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17571083456277847, "step": 16954 }, { "epoch": 0.33912, "grad_norm": 1.9296875, "grad_norm_var": 0.019481404622395834, "learning_rate": 0.0001, "loss": 3.922, "loss/crossentropy": 1.9351304769515991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918834000825882, "step": 16956 }, { "epoch": 0.33916, "grad_norm": 2.453125, "grad_norm_var": 0.017728424072265624, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 1.974304735660553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20828718692064285, "step": 16958 }, { "epoch": 0.3392, "grad_norm": 1.921875, "grad_norm_var": 0.017867024739583334, "learning_rate": 0.0001, "loss": 3.8506, "loss/crossentropy": 2.3836190700531006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074722871184349, "step": 16960 }, { "epoch": 0.33924, "grad_norm": 1.875, "grad_norm_var": 0.018123372395833334, "learning_rate": 0.0001, "loss": 4.0827, "loss/crossentropy": 2.1663198471069336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20097027719020844, "step": 16962 }, { "epoch": 0.33928, "grad_norm": 2.03125, "grad_norm_var": 0.01810277303059896, "learning_rate": 0.0001, "loss": 4.2466, "loss/crossentropy": 2.275226354598999, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22942040860652924, "step": 16964 }, { "epoch": 0.33932, "grad_norm": 1.9375, "grad_norm_var": 0.017796834309895832, "learning_rate": 0.0001, "loss": 3.9802, "loss/crossentropy": 1.7741501331329346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822734773159027, "step": 16966 }, { "epoch": 0.33936, "grad_norm": 1.984375, "grad_norm_var": 0.017215983072916666, "learning_rate": 0.0001, "loss": 3.9016, "loss/crossentropy": 1.791014850139618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008860930800438, "step": 16968 }, { "epoch": 0.3394, "grad_norm": 2.046875, "grad_norm_var": 0.016888173421223958, "learning_rate": 0.0001, "loss": 4.1941, "loss/crossentropy": 1.9239555597305298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192842036485672, "step": 16970 }, { "epoch": 0.33944, "grad_norm": 2.09375, "grad_norm_var": 0.016788482666015625, "learning_rate": 0.0001, "loss": 4.108, "loss/crossentropy": 2.1543636322021484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20888526737689972, "step": 16972 }, { "epoch": 0.33948, "grad_norm": 1.84375, "grad_norm_var": 0.005012003580729166, "learning_rate": 0.0001, "loss": 3.975, "loss/crossentropy": 2.2322418093681335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19024743884801865, "step": 16974 }, { "epoch": 0.33952, "grad_norm": 2.109375, "grad_norm_var": 0.012422434488932292, "learning_rate": 0.0001, "loss": 4.3208, "loss/crossentropy": 1.9934163093566895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26193471997976303, "step": 16976 }, { "epoch": 0.33956, "grad_norm": 2.03125, "grad_norm_var": 0.013244374593098959, "learning_rate": 0.0001, "loss": 4.2753, "loss/crossentropy": 2.340088725090027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110593616962433, "step": 16978 }, { "epoch": 0.3396, "grad_norm": 1.890625, "grad_norm_var": 0.03021214803059896, "learning_rate": 0.0001, "loss": 4.2047, "loss/crossentropy": 2.057206869125366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071269005537033, "step": 16980 }, { "epoch": 0.33964, "grad_norm": 1.953125, "grad_norm_var": 0.030210113525390624, "learning_rate": 0.0001, "loss": 4.0262, "loss/crossentropy": 2.002032458782196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19903026521205902, "step": 16982 }, { "epoch": 0.33968, "grad_norm": 2.0, "grad_norm_var": 0.030804189046223958, "learning_rate": 0.0001, "loss": 4.246, "loss/crossentropy": 2.01567679643631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20198361575603485, "step": 16984 }, { "epoch": 0.33972, "grad_norm": 1.8515625, "grad_norm_var": 0.032814280192057295, "learning_rate": 0.0001, "loss": 4.0606, "loss/crossentropy": 1.661778211593628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16601823270320892, "step": 16986 }, { "epoch": 0.33976, "grad_norm": 2.015625, "grad_norm_var": 0.03467203776041667, "learning_rate": 0.0001, "loss": 3.9411, "loss/crossentropy": 2.284485101699829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20108629018068314, "step": 16988 }, { "epoch": 0.3398, "grad_norm": 2.03125, "grad_norm_var": 0.03200861612955729, "learning_rate": 0.0001, "loss": 4.1066, "loss/crossentropy": 2.1860578656196594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22195414453744888, "step": 16990 }, { "epoch": 0.33984, "grad_norm": 1.984375, "grad_norm_var": 0.02719904581705729, "learning_rate": 0.0001, "loss": 4.0212, "loss/crossentropy": 2.032013416290283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19930274784564972, "step": 16992 }, { "epoch": 0.33988, "grad_norm": 2.0625, "grad_norm_var": 0.1898577372233073, "learning_rate": 0.0001, "loss": 4.2847, "loss/crossentropy": 2.227185010910034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19806954264640808, "step": 16994 }, { "epoch": 0.33992, "grad_norm": 1.9609375, "grad_norm_var": 0.17535807291666666, "learning_rate": 0.0001, "loss": 4.1526, "loss/crossentropy": 2.222020983695984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2354518324136734, "step": 16996 }, { "epoch": 0.33996, "grad_norm": 2.109375, "grad_norm_var": 0.1755767822265625, "learning_rate": 0.0001, "loss": 4.1766, "loss/crossentropy": 2.336432456970215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19555255770683289, "step": 16998 }, { "epoch": 0.34, "grad_norm": 1.96875, "grad_norm_var": 0.1748443603515625, "learning_rate": 0.0001, "loss": 4.3312, "loss/crossentropy": 2.17374986410141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21119412034749985, "step": 17000 }, { "epoch": 0.34004, "grad_norm": 2.203125, "grad_norm_var": 0.17195536295572916, "learning_rate": 0.0001, "loss": 4.1445, "loss/crossentropy": 2.135382056236267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2263253927230835, "step": 17002 }, { "epoch": 0.34008, "grad_norm": 2.09375, "grad_norm_var": 0.20149917602539064, "learning_rate": 0.0001, "loss": 3.811, "loss/crossentropy": 1.9646037220954895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18890808522701263, "step": 17004 }, { "epoch": 0.34012, "grad_norm": 1.921875, "grad_norm_var": 0.20098368326822916, "learning_rate": 0.0001, "loss": 4.0889, "loss/crossentropy": 1.8009583353996277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882813572883606, "step": 17006 }, { "epoch": 0.34016, "grad_norm": 3.109375, "grad_norm_var": 0.24433492024739584, "learning_rate": 0.0001, "loss": 4.4921, "loss/crossentropy": 2.0362982153892517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208044171333313, "step": 17008 }, { "epoch": 0.3402, "grad_norm": 1.9765625, "grad_norm_var": 0.11477432250976563, "learning_rate": 0.0001, "loss": 4.1781, "loss/crossentropy": 2.2055057287216187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21141493320465088, "step": 17010 }, { "epoch": 0.34024, "grad_norm": 2.0, "grad_norm_var": 0.11963475545247396, "learning_rate": 0.0001, "loss": 3.772, "loss/crossentropy": 1.386088252067566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16025834530591965, "step": 17012 }, { "epoch": 0.34028, "grad_norm": 2.1875, "grad_norm_var": 0.11599299112955729, "learning_rate": 0.0001, "loss": 4.2389, "loss/crossentropy": 2.131182312965393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20460083335638046, "step": 17014 }, { "epoch": 0.34032, "grad_norm": 2.015625, "grad_norm_var": 0.11592992146809895, "learning_rate": 0.0001, "loss": 4.2681, "loss/crossentropy": 2.2682281732559204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22915159910917282, "step": 17016 }, { "epoch": 0.34036, "grad_norm": 1.96875, "grad_norm_var": 0.12011693318684896, "learning_rate": 0.0001, "loss": 4.0521, "loss/crossentropy": 2.0855059027671814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19398894160985947, "step": 17018 }, { "epoch": 0.3404, "grad_norm": 1.9921875, "grad_norm_var": 0.0842437744140625, "learning_rate": 0.0001, "loss": 4.0421, "loss/crossentropy": 1.895095944404602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172701731324196, "step": 17020 }, { "epoch": 0.34044, "grad_norm": 2.0, "grad_norm_var": 0.08455403645833333, "learning_rate": 0.0001, "loss": 4.0814, "loss/crossentropy": 2.10223788022995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144509255886078, "step": 17022 }, { "epoch": 0.34048, "grad_norm": 2.03125, "grad_norm_var": 0.014241282145182292, "learning_rate": 0.0001, "loss": 4.1211, "loss/crossentropy": 2.1131081581115723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21176592260599136, "step": 17024 }, { "epoch": 0.34052, "grad_norm": 2.0, "grad_norm_var": 0.009991200764973958, "learning_rate": 0.0001, "loss": 4.1581, "loss/crossentropy": 2.1125452518463135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931193768978119, "step": 17026 }, { "epoch": 0.34056, "grad_norm": 1.8046875, "grad_norm_var": 0.017775217692057293, "learning_rate": 0.0001, "loss": 4.2624, "loss/crossentropy": 2.213385283946991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2576863020658493, "step": 17028 }, { "epoch": 0.3406, "grad_norm": 2.171875, "grad_norm_var": 0.017773183186848958, "learning_rate": 0.0001, "loss": 4.408, "loss/crossentropy": 2.0793206095695496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20338336378335953, "step": 17030 }, { "epoch": 0.34064, "grad_norm": 2.078125, "grad_norm_var": 0.01807225545247396, "learning_rate": 0.0001, "loss": 4.3102, "loss/crossentropy": 1.8870239853858948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18544750660657883, "step": 17032 }, { "epoch": 0.34068, "grad_norm": 1.9609375, "grad_norm_var": 0.017254384358723958, "learning_rate": 0.0001, "loss": 4.1538, "loss/crossentropy": 1.9893989562988281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19679997861385345, "step": 17034 }, { "epoch": 0.34072, "grad_norm": 1.9609375, "grad_norm_var": 0.01845270792643229, "learning_rate": 0.0001, "loss": 3.9447, "loss/crossentropy": 2.0401668548583984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19242482632398605, "step": 17036 }, { "epoch": 0.34076, "grad_norm": 2.109375, "grad_norm_var": 0.015472157796223959, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 2.0786415934562683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21781006455421448, "step": 17038 }, { "epoch": 0.3408, "grad_norm": 1.9296875, "grad_norm_var": 0.015533192952473959, "learning_rate": 0.0001, "loss": 3.9472, "loss/crossentropy": 2.143572211265564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20608216524124146, "step": 17040 }, { "epoch": 0.34084, "grad_norm": 2.0625, "grad_norm_var": 0.0154693603515625, "learning_rate": 0.0001, "loss": 4.3305, "loss/crossentropy": 2.0578572750091553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22006108611822128, "step": 17042 }, { "epoch": 0.34088, "grad_norm": 1.9609375, "grad_norm_var": 0.0061920166015625, "learning_rate": 0.0001, "loss": 4.1295, "loss/crossentropy": 2.1016032099723816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059948667883873, "step": 17044 }, { "epoch": 0.34092, "grad_norm": 1.9609375, "grad_norm_var": 0.00438232421875, "learning_rate": 0.0001, "loss": 3.9293, "loss/crossentropy": 1.814449965953827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1946963220834732, "step": 17046 }, { "epoch": 0.34096, "grad_norm": 1.9453125, "grad_norm_var": 0.004233551025390625, "learning_rate": 0.0001, "loss": 4.0696, "loss/crossentropy": 2.2421000599861145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2114911824464798, "step": 17048 }, { "epoch": 0.341, "grad_norm": 2.125, "grad_norm_var": 0.0070231119791666664, "learning_rate": 0.0001, "loss": 3.984, "loss/crossentropy": 2.1200218200683594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212583489716053, "step": 17050 }, { "epoch": 0.34104, "grad_norm": 1.921875, "grad_norm_var": 0.007441965738932291, "learning_rate": 0.0001, "loss": 3.8656, "loss/crossentropy": 1.9486305713653564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18631484359502792, "step": 17052 }, { "epoch": 0.34108, "grad_norm": 1.9921875, "grad_norm_var": 0.007721964518229167, "learning_rate": 0.0001, "loss": 4.3888, "loss/crossentropy": 2.4838292598724365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23271672427654266, "step": 17054 }, { "epoch": 0.34112, "grad_norm": 1.96875, "grad_norm_var": 0.008137766520182292, "learning_rate": 0.0001, "loss": 3.9099, "loss/crossentropy": 1.8606489896774292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955883800983429, "step": 17056 }, { "epoch": 0.34116, "grad_norm": 2.15625, "grad_norm_var": 0.013199615478515624, "learning_rate": 0.0001, "loss": 4.2847, "loss/crossentropy": 2.1702204942703247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21619878709316254, "step": 17058 }, { "epoch": 0.3412, "grad_norm": 2.015625, "grad_norm_var": 0.013158162434895834, "learning_rate": 0.0001, "loss": 4.1987, "loss/crossentropy": 2.1011393070220947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2120063453912735, "step": 17060 }, { "epoch": 0.34124, "grad_norm": 1.953125, "grad_norm_var": 0.013142903645833334, "learning_rate": 0.0001, "loss": 4.1704, "loss/crossentropy": 2.267427682876587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20903929322957993, "step": 17062 }, { "epoch": 0.34128, "grad_norm": 1.984375, "grad_norm_var": 0.01620457967122396, "learning_rate": 0.0001, "loss": 4.3312, "loss/crossentropy": 2.125304937362671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18540818989276886, "step": 17064 }, { "epoch": 0.34132, "grad_norm": 1.90625, "grad_norm_var": 0.013925933837890625, "learning_rate": 0.0001, "loss": 3.9552, "loss/crossentropy": 1.8741024136543274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19243815541267395, "step": 17066 }, { "epoch": 0.34136, "grad_norm": 2.46875, "grad_norm_var": 0.023451487223307293, "learning_rate": 0.0001, "loss": 4.0252, "loss/crossentropy": 2.12781822681427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23974300920963287, "step": 17068 }, { "epoch": 0.3414, "grad_norm": 2.15625, "grad_norm_var": 0.02448298136393229, "learning_rate": 0.0001, "loss": 4.2728, "loss/crossentropy": 2.0786141753196716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22311320155858994, "step": 17070 }, { "epoch": 0.34144, "grad_norm": 1.9140625, "grad_norm_var": 0.02408421834309896, "learning_rate": 0.0001, "loss": 4.1686, "loss/crossentropy": 2.1569892168045044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20391946285963058, "step": 17072 }, { "epoch": 0.34148, "grad_norm": 1.90625, "grad_norm_var": 0.024607086181640626, "learning_rate": 0.0001, "loss": 3.8035, "loss/crossentropy": 1.8860353231430054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937229335308075, "step": 17074 }, { "epoch": 0.34152, "grad_norm": 2.0, "grad_norm_var": 0.02448094685872396, "learning_rate": 0.0001, "loss": 4.0993, "loss/crossentropy": 2.25645911693573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088249772787094, "step": 17076 }, { "epoch": 0.34156, "grad_norm": 2.328125, "grad_norm_var": 0.02972997029622396, "learning_rate": 0.0001, "loss": 4.3486, "loss/crossentropy": 2.379481792449951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2341819554567337, "step": 17078 }, { "epoch": 0.3416, "grad_norm": 1.9296875, "grad_norm_var": 0.0273345947265625, "learning_rate": 0.0001, "loss": 4.0841, "loss/crossentropy": 2.1564733386039734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2162168025970459, "step": 17080 }, { "epoch": 0.34164, "grad_norm": 3.015625, "grad_norm_var": 0.0860260009765625, "learning_rate": 0.0001, "loss": 4.2526, "loss/crossentropy": 1.8906886577606201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18614360690116882, "step": 17082 }, { "epoch": 0.34168, "grad_norm": 2.03125, "grad_norm_var": 0.07732747395833334, "learning_rate": 0.0001, "loss": 4.0135, "loss/crossentropy": 2.093637704849243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19867172092199326, "step": 17084 }, { "epoch": 0.34172, "grad_norm": 2.03125, "grad_norm_var": 0.0760210673014323, "learning_rate": 0.0001, "loss": 4.2364, "loss/crossentropy": 2.2579731941223145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2153376340866089, "step": 17086 }, { "epoch": 0.34176, "grad_norm": 2.15625, "grad_norm_var": 0.07312393188476562, "learning_rate": 0.0001, "loss": 4.3275, "loss/crossentropy": 2.250504732131958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2759099751710892, "step": 17088 }, { "epoch": 0.3418, "grad_norm": 1.921875, "grad_norm_var": 0.07190653483072916, "learning_rate": 0.0001, "loss": 3.8977, "loss/crossentropy": 1.6062138676643372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15309542417526245, "step": 17090 }, { "epoch": 0.34184, "grad_norm": 1.8828125, "grad_norm_var": 0.07587865193684896, "learning_rate": 0.0001, "loss": 4.2199, "loss/crossentropy": 2.227494239807129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20563329756259918, "step": 17092 }, { "epoch": 0.34188, "grad_norm": 1.921875, "grad_norm_var": 0.07314631144205729, "learning_rate": 0.0001, "loss": 4.1095, "loss/crossentropy": 2.1047890186309814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19775691628456116, "step": 17094 }, { "epoch": 0.34192, "grad_norm": 2.015625, "grad_norm_var": 0.07301813761393229, "learning_rate": 0.0001, "loss": 4.0508, "loss/crossentropy": 2.044450581073761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19867002964019775, "step": 17096 }, { "epoch": 0.34196, "grad_norm": 1.953125, "grad_norm_var": 0.009511057535807292, "learning_rate": 0.0001, "loss": 4.2319, "loss/crossentropy": 1.9139947891235352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20143548399209976, "step": 17098 }, { "epoch": 0.342, "grad_norm": 1.9375, "grad_norm_var": 0.008162180582682291, "learning_rate": 0.0001, "loss": 3.7159, "loss/crossentropy": 1.6752634048461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17691873013973236, "step": 17100 }, { "epoch": 0.34204, "grad_norm": 1.9140625, "grad_norm_var": 0.0156402587890625, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 2.277552843093872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049017697572708, "step": 17102 }, { "epoch": 0.34208, "grad_norm": 1.9765625, "grad_norm_var": 0.015608469645182291, "learning_rate": 0.0001, "loss": 4.0663, "loss/crossentropy": 2.458711266517639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21726765483617783, "step": 17104 }, { "epoch": 0.34212, "grad_norm": 2.171875, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 4.0384, "loss/crossentropy": 1.9302194714546204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19446631520986557, "step": 17106 }, { "epoch": 0.34216, "grad_norm": 1.890625, "grad_norm_var": 0.01622314453125, "learning_rate": 0.0001, "loss": 3.8935, "loss/crossentropy": 2.021213114261627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19394385814666748, "step": 17108 }, { "epoch": 0.3422, "grad_norm": 2.0, "grad_norm_var": 0.01689453125, "learning_rate": 0.0001, "loss": 4.4094, "loss/crossentropy": 2.4856090545654297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156004160642624, "step": 17110 }, { "epoch": 0.34224, "grad_norm": 2.09375, "grad_norm_var": 0.016869862874348957, "learning_rate": 0.0001, "loss": 4.4135, "loss/crossentropy": 2.410070061683655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21203476190567017, "step": 17112 }, { "epoch": 0.34228, "grad_norm": 1.9765625, "grad_norm_var": 0.019136555989583335, "learning_rate": 0.0001, "loss": 4.0484, "loss/crossentropy": 2.026827871799469, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20053605735301971, "step": 17114 }, { "epoch": 0.34232, "grad_norm": 2.09375, "grad_norm_var": 0.016283162434895835, "learning_rate": 0.0001, "loss": 4.2319, "loss/crossentropy": 2.1833966970443726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193789780139923, "step": 17116 }, { "epoch": 0.34236, "grad_norm": 2.109375, "grad_norm_var": 0.010798899332682292, "learning_rate": 0.0001, "loss": 4.0921, "loss/crossentropy": 2.137472152709961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21218669414520264, "step": 17118 }, { "epoch": 0.3424, "grad_norm": 1.96875, "grad_norm_var": 0.013741048177083333, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 2.0607420206069946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138473093509674, "step": 17120 }, { "epoch": 0.34244, "grad_norm": 1.984375, "grad_norm_var": 0.0123687744140625, "learning_rate": 0.0001, "loss": 4.1787, "loss/crossentropy": 1.9109066724777222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20130306482315063, "step": 17122 }, { "epoch": 0.34248, "grad_norm": 1.9453125, "grad_norm_var": 0.010944620768229166, "learning_rate": 0.0001, "loss": 3.7531, "loss/crossentropy": 2.1049917936325073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20878130197525024, "step": 17124 }, { "epoch": 0.34252, "grad_norm": 2.03125, "grad_norm_var": 0.010008748372395833, "learning_rate": 0.0001, "loss": 4.0951, "loss/crossentropy": 1.9087567925453186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18959704041481018, "step": 17126 }, { "epoch": 0.34256, "grad_norm": 2.125, "grad_norm_var": 0.010033162434895833, "learning_rate": 0.0001, "loss": 4.3854, "loss/crossentropy": 2.491095781326294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21739569306373596, "step": 17128 }, { "epoch": 0.3426, "grad_norm": 1.9609375, "grad_norm_var": 0.0081207275390625, "learning_rate": 0.0001, "loss": 4.0227, "loss/crossentropy": 2.045976400375366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18868304044008255, "step": 17130 }, { "epoch": 0.34264, "grad_norm": 2.109375, "grad_norm_var": 0.009318033854166666, "learning_rate": 0.0001, "loss": 4.1221, "loss/crossentropy": 2.021378517150879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955355852842331, "step": 17132 }, { "epoch": 0.34268, "grad_norm": 1.9140625, "grad_norm_var": 0.010247548421223959, "learning_rate": 0.0001, "loss": 4.0244, "loss/crossentropy": 2.052547872066498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18654850870370865, "step": 17134 }, { "epoch": 0.34272, "grad_norm": 2.09375, "grad_norm_var": 0.008886464436848958, "learning_rate": 0.0001, "loss": 4.2536, "loss/crossentropy": 2.1709023118019104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21335267275571823, "step": 17136 }, { "epoch": 0.34276, "grad_norm": 1.921875, "grad_norm_var": 0.010477701822916666, "learning_rate": 0.0001, "loss": 4.1682, "loss/crossentropy": 2.310550093650818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18719495832920074, "step": 17138 }, { "epoch": 0.3428, "grad_norm": 2.046875, "grad_norm_var": 0.011262003580729167, "learning_rate": 0.0001, "loss": 4.0145, "loss/crossentropy": 1.9357584714889526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914527416229248, "step": 17140 }, { "epoch": 0.34284, "grad_norm": 1.9140625, "grad_norm_var": 0.012684885660807292, "learning_rate": 0.0001, "loss": 3.9673, "loss/crossentropy": 1.7561541199684143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1711764633655548, "step": 17142 }, { "epoch": 0.34288, "grad_norm": 2.515625, "grad_norm_var": 0.7546994527180989, "learning_rate": 0.0001, "loss": 4.562, "loss/crossentropy": 2.4081480503082275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2405518889427185, "step": 17144 }, { "epoch": 0.34292, "grad_norm": 2.34375, "grad_norm_var": 0.7456451416015625, "learning_rate": 0.0001, "loss": 4.4023, "loss/crossentropy": 2.44531512260437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23487353324890137, "step": 17146 }, { "epoch": 0.34296, "grad_norm": 1.9296875, "grad_norm_var": 0.7445696512858073, "learning_rate": 0.0001, "loss": 4.0845, "loss/crossentropy": 1.9712305068969727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20045534521341324, "step": 17148 }, { "epoch": 0.343, "grad_norm": 1.890625, "grad_norm_var": 0.7468706766764323, "learning_rate": 0.0001, "loss": 3.8479, "loss/crossentropy": 1.9718485474586487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18550659716129303, "step": 17150 }, { "epoch": 0.34304, "grad_norm": 1.9453125, "grad_norm_var": 0.7527414957682291, "learning_rate": 0.0001, "loss": 4.2708, "loss/crossentropy": 2.1901930570602417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2082839384675026, "step": 17152 }, { "epoch": 0.34308, "grad_norm": 1.953125, "grad_norm_var": 0.7463905334472656, "learning_rate": 0.0001, "loss": 4.2104, "loss/crossentropy": 2.098864793777466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19527551531791687, "step": 17154 }, { "epoch": 0.34312, "grad_norm": 2.0, "grad_norm_var": 0.7467437744140625, "learning_rate": 0.0001, "loss": 3.9873, "loss/crossentropy": 1.7448402643203735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1862412989139557, "step": 17156 }, { "epoch": 0.34316, "grad_norm": 2.109375, "grad_norm_var": 0.7429603576660156, "learning_rate": 0.0001, "loss": 4.1016, "loss/crossentropy": 1.9973859190940857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19645331799983978, "step": 17158 }, { "epoch": 0.3432, "grad_norm": 1.8515625, "grad_norm_var": 0.016874186197916665, "learning_rate": 0.0001, "loss": 4.2022, "loss/crossentropy": 2.2570079565048218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20290197432041168, "step": 17160 }, { "epoch": 0.34324, "grad_norm": 2.15625, "grad_norm_var": 0.008642323811848958, "learning_rate": 0.0001, "loss": 4.2304, "loss/crossentropy": 1.7943695187568665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1924503892660141, "step": 17162 }, { "epoch": 0.34328, "grad_norm": 2.0, "grad_norm_var": 0.008225250244140624, "learning_rate": 0.0001, "loss": 4.2142, "loss/crossentropy": 2.1875799894332886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968172788619995, "step": 17164 }, { "epoch": 0.34332, "grad_norm": 2.0625, "grad_norm_var": 0.007477823893229167, "learning_rate": 0.0001, "loss": 4.1844, "loss/crossentropy": 2.242374360561371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21981361508369446, "step": 17166 }, { "epoch": 0.34336, "grad_norm": 2.046875, "grad_norm_var": 0.006754302978515625, "learning_rate": 0.0001, "loss": 4.0406, "loss/crossentropy": 1.8542492985725403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19412916153669357, "step": 17168 }, { "epoch": 0.3434, "grad_norm": 1.9765625, "grad_norm_var": 0.0067860921223958336, "learning_rate": 0.0001, "loss": 3.8804, "loss/crossentropy": 1.9377904534339905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20015903562307358, "step": 17170 }, { "epoch": 0.34344, "grad_norm": 1.9765625, "grad_norm_var": 0.006319173177083333, "learning_rate": 0.0001, "loss": 4.1691, "loss/crossentropy": 2.2250781655311584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1943892240524292, "step": 17172 }, { "epoch": 0.34348, "grad_norm": 1.984375, "grad_norm_var": 0.0054094950358072914, "learning_rate": 0.0001, "loss": 4.0274, "loss/crossentropy": 2.0463303923606873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931215077638626, "step": 17174 }, { "epoch": 0.34352, "grad_norm": 2.078125, "grad_norm_var": 0.004984537760416667, "learning_rate": 0.0001, "loss": 3.826, "loss/crossentropy": 2.0576277375221252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24007725715637207, "step": 17176 }, { "epoch": 0.34356, "grad_norm": 1.859375, "grad_norm_var": 0.003999582926432292, "learning_rate": 0.0001, "loss": 3.9865, "loss/crossentropy": 1.8423307538032532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17171106487512589, "step": 17178 }, { "epoch": 0.3436, "grad_norm": 1.984375, "grad_norm_var": 0.0053375244140625, "learning_rate": 0.0001, "loss": 4.3089, "loss/crossentropy": 2.05659818649292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20058965682983398, "step": 17180 }, { "epoch": 0.34364, "grad_norm": 2.046875, "grad_norm_var": 0.0055620829264322914, "learning_rate": 0.0001, "loss": 3.8965, "loss/crossentropy": 1.926946997642517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1770440638065338, "step": 17182 }, { "epoch": 0.34368, "grad_norm": 2.015625, "grad_norm_var": 0.005098215738932292, "learning_rate": 0.0001, "loss": 3.9781, "loss/crossentropy": 1.8181686401367188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19718249142169952, "step": 17184 }, { "epoch": 0.34372, "grad_norm": 1.8203125, "grad_norm_var": 0.006624094645182292, "learning_rate": 0.0001, "loss": 4.0732, "loss/crossentropy": 2.021128237247467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968582272529602, "step": 17186 }, { "epoch": 0.34376, "grad_norm": 2.109375, "grad_norm_var": 0.008365631103515625, "learning_rate": 0.0001, "loss": 4.2861, "loss/crossentropy": 2.04776668548584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22718969732522964, "step": 17188 }, { "epoch": 0.3438, "grad_norm": 1.890625, "grad_norm_var": 0.008957672119140624, "learning_rate": 0.0001, "loss": 4.2196, "loss/crossentropy": 2.2225213050842285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20369096100330353, "step": 17190 }, { "epoch": 0.34384, "grad_norm": 1.9609375, "grad_norm_var": 0.008089192708333333, "learning_rate": 0.0001, "loss": 4.1137, "loss/crossentropy": 1.8544179201126099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20437808334827423, "step": 17192 }, { "epoch": 0.34388, "grad_norm": 1.9921875, "grad_norm_var": 0.006730143229166667, "learning_rate": 0.0001, "loss": 4.387, "loss/crossentropy": 2.1270517110824585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997273936867714, "step": 17194 }, { "epoch": 0.34392, "grad_norm": 1.984375, "grad_norm_var": 0.0060791015625, "learning_rate": 0.0001, "loss": 3.8062, "loss/crossentropy": 1.8699182271957397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19382557272911072, "step": 17196 }, { "epoch": 0.34396, "grad_norm": 2.046875, "grad_norm_var": 0.006941731770833333, "learning_rate": 0.0001, "loss": 3.9864, "loss/crossentropy": 1.8734498023986816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18053626269102097, "step": 17198 }, { "epoch": 0.344, "grad_norm": 1.890625, "grad_norm_var": 0.008097330729166666, "learning_rate": 0.0001, "loss": 3.9234, "loss/crossentropy": 1.9386133551597595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851777583360672, "step": 17200 }, { "epoch": 0.34404, "grad_norm": 2.015625, "grad_norm_var": 0.0069163004557291664, "learning_rate": 0.0001, "loss": 3.9619, "loss/crossentropy": 2.1074278354644775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22176695615053177, "step": 17202 }, { "epoch": 0.34408, "grad_norm": 1.9296875, "grad_norm_var": 0.005895741780598958, "learning_rate": 0.0001, "loss": 4.1231, "loss/crossentropy": 1.826387107372284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19368141889572144, "step": 17204 }, { "epoch": 0.34412, "grad_norm": 1.9921875, "grad_norm_var": 0.005680084228515625, "learning_rate": 0.0001, "loss": 4.1367, "loss/crossentropy": 1.9109328389167786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023227959871292, "step": 17206 }, { "epoch": 0.34416, "grad_norm": 2.09375, "grad_norm_var": 0.006517537434895833, "learning_rate": 0.0001, "loss": 4.1495, "loss/crossentropy": 1.8752732872962952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17867545038461685, "step": 17208 }, { "epoch": 0.3442, "grad_norm": 1.828125, "grad_norm_var": 0.00972900390625, "learning_rate": 0.0001, "loss": 3.9998, "loss/crossentropy": 1.95436429977417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19947312772274017, "step": 17210 }, { "epoch": 0.34424, "grad_norm": 1.984375, "grad_norm_var": 0.00947265625, "learning_rate": 0.0001, "loss": 4.0074, "loss/crossentropy": 1.9237809777259827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055458500981331, "step": 17212 }, { "epoch": 0.34428, "grad_norm": 1.9375, "grad_norm_var": 0.007995351155598959, "learning_rate": 0.0001, "loss": 4.1351, "loss/crossentropy": 2.4497138261795044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23067011684179306, "step": 17214 }, { "epoch": 0.34432, "grad_norm": 1.859375, "grad_norm_var": 0.007791900634765625, "learning_rate": 0.0001, "loss": 3.9144, "loss/crossentropy": 2.33384370803833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22732949256896973, "step": 17216 }, { "epoch": 0.34436, "grad_norm": 2.0, "grad_norm_var": 0.0075439453125, "learning_rate": 0.0001, "loss": 3.9414, "loss/crossentropy": 2.022156059741974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063901051878929, "step": 17218 }, { "epoch": 0.3444, "grad_norm": 1.90625, "grad_norm_var": 0.007614898681640625, "learning_rate": 0.0001, "loss": 4.0642, "loss/crossentropy": 2.183963179588318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21098726987838745, "step": 17220 }, { "epoch": 0.34444, "grad_norm": 6.875, "grad_norm_var": 1.501227823893229, "learning_rate": 0.0001, "loss": 4.3139, "loss/crossentropy": 2.256633758544922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21449647843837738, "step": 17222 }, { "epoch": 0.34448, "grad_norm": 14.3125, "grad_norm_var": 10.478612263997396, "learning_rate": 0.0001, "loss": 3.838, "loss/crossentropy": 1.8303287625312805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18412896245718002, "step": 17224 }, { "epoch": 0.34452, "grad_norm": 2.0625, "grad_norm_var": 10.405972290039063, "learning_rate": 0.0001, "loss": 4.1852, "loss/crossentropy": 2.3248833417892456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23459070920944214, "step": 17226 }, { "epoch": 0.34456, "grad_norm": 2.109375, "grad_norm_var": 10.418485514322917, "learning_rate": 0.0001, "loss": 4.062, "loss/crossentropy": 2.018009066581726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21266046166419983, "step": 17228 }, { "epoch": 0.3446, "grad_norm": 1.984375, "grad_norm_var": 10.434609985351562, "learning_rate": 0.0001, "loss": 4.186, "loss/crossentropy": 2.337665557861328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.225304514169693, "step": 17230 }, { "epoch": 0.34464, "grad_norm": 1.8046875, "grad_norm_var": 10.453043365478516, "learning_rate": 0.0001, "loss": 3.86, "loss/crossentropy": 1.8581790924072266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18770471215248108, "step": 17232 }, { "epoch": 0.34468, "grad_norm": 1.8515625, "grad_norm_var": 10.462247721354167, "learning_rate": 0.0001, "loss": 3.9213, "loss/crossentropy": 1.9208187460899353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19106490910053253, "step": 17234 }, { "epoch": 0.34472, "grad_norm": 1.8515625, "grad_norm_var": 10.454630279541016, "learning_rate": 0.0001, "loss": 3.8284, "loss/crossentropy": 1.8844050765037537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17407642304897308, "step": 17236 }, { "epoch": 0.34476, "grad_norm": 1.9296875, "grad_norm_var": 9.497085571289062, "learning_rate": 0.0001, "loss": 3.7423, "loss/crossentropy": 2.0443355441093445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198170468211174, "step": 17238 }, { "epoch": 0.3448, "grad_norm": 2.046875, "grad_norm_var": 0.027342732747395834, "learning_rate": 0.0001, "loss": 4.0965, "loss/crossentropy": 1.8563520908355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958291083574295, "step": 17240 }, { "epoch": 0.34484, "grad_norm": 2.015625, "grad_norm_var": 0.0127349853515625, "learning_rate": 0.0001, "loss": 4.3028, "loss/crossentropy": 2.1252214908599854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22386425733566284, "step": 17242 }, { "epoch": 0.34488, "grad_norm": 1.9375, "grad_norm_var": 0.011848958333333333, "learning_rate": 0.0001, "loss": 3.926, "loss/crossentropy": 2.1379681825637817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19781935214996338, "step": 17244 }, { "epoch": 0.34492, "grad_norm": 1.828125, "grad_norm_var": 0.013529205322265625, "learning_rate": 0.0001, "loss": 3.7403, "loss/crossentropy": 1.9774981141090393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18523120880126953, "step": 17246 }, { "epoch": 0.34496, "grad_norm": 2.015625, "grad_norm_var": 0.0120025634765625, "learning_rate": 0.0001, "loss": 4.2668, "loss/crossentropy": 2.3234479427337646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21939057111740112, "step": 17248 }, { "epoch": 0.345, "grad_norm": 2.109375, "grad_norm_var": 0.011706288655598958, "learning_rate": 0.0001, "loss": 4.11, "loss/crossentropy": 2.0521583557128906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20204473286867142, "step": 17250 }, { "epoch": 0.34504, "grad_norm": 1.90625, "grad_norm_var": 0.03452860514322917, "learning_rate": 0.0001, "loss": 4.2184, "loss/crossentropy": 2.486477255821228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21673013269901276, "step": 17252 }, { "epoch": 0.34508, "grad_norm": 1.8515625, "grad_norm_var": 0.035456339518229164, "learning_rate": 0.0001, "loss": 4.0215, "loss/crossentropy": 2.285652995109558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21322200447320938, "step": 17254 }, { "epoch": 0.34512, "grad_norm": 2.046875, "grad_norm_var": 0.03436686197916667, "learning_rate": 0.0001, "loss": 4.1855, "loss/crossentropy": 2.0946747064590454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19791530817747116, "step": 17256 }, { "epoch": 0.34516, "grad_norm": 2.09375, "grad_norm_var": 0.03250732421875, "learning_rate": 0.0001, "loss": 4.1129, "loss/crossentropy": 1.7509565353393555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16754039376974106, "step": 17258 }, { "epoch": 0.3452, "grad_norm": 2.03125, "grad_norm_var": 0.0325347900390625, "learning_rate": 0.0001, "loss": 4.3411, "loss/crossentropy": 2.3828574419021606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22555553913116455, "step": 17260 }, { "epoch": 0.34524, "grad_norm": 2.140625, "grad_norm_var": 0.029060872395833333, "learning_rate": 0.0001, "loss": 4.215, "loss/crossentropy": 1.997941255569458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1815095767378807, "step": 17262 }, { "epoch": 0.34528, "grad_norm": 2.25, "grad_norm_var": 0.0296630859375, "learning_rate": 0.0001, "loss": 4.3814, "loss/crossentropy": 2.3036177158355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2305641770362854, "step": 17264 }, { "epoch": 0.34532, "grad_norm": 1.8984375, "grad_norm_var": 0.03178075154622396, "learning_rate": 0.0001, "loss": 4.1668, "loss/crossentropy": 2.202543616294861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19594155251979828, "step": 17266 }, { "epoch": 0.34536, "grad_norm": 1.875, "grad_norm_var": 0.011199696858723959, "learning_rate": 0.0001, "loss": 4.1792, "loss/crossentropy": 2.1162038445472717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945410966873169, "step": 17268 }, { "epoch": 0.3454, "grad_norm": 1.90625, "grad_norm_var": 0.009883626302083334, "learning_rate": 0.0001, "loss": 4.1109, "loss/crossentropy": 2.273344039916992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24327096343040466, "step": 17270 }, { "epoch": 0.34544, "grad_norm": 1.9765625, "grad_norm_var": 0.011351521809895833, "learning_rate": 0.0001, "loss": 3.8254, "loss/crossentropy": 1.851362407207489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18494703620672226, "step": 17272 }, { "epoch": 0.34548, "grad_norm": 2.046875, "grad_norm_var": 0.01114501953125, "learning_rate": 0.0001, "loss": 4.0846, "loss/crossentropy": 1.9861729145050049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20077189058065414, "step": 17274 }, { "epoch": 0.34552, "grad_norm": 1.9765625, "grad_norm_var": 0.011716461181640625, "learning_rate": 0.0001, "loss": 4.0692, "loss/crossentropy": 2.0234099626541138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20506415516138077, "step": 17276 }, { "epoch": 0.34556, "grad_norm": 1.90625, "grad_norm_var": 0.012326812744140625, "learning_rate": 0.0001, "loss": 3.9843, "loss/crossentropy": 2.1666045784950256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18108145147562027, "step": 17278 }, { "epoch": 0.3456, "grad_norm": 1.8984375, "grad_norm_var": 0.0060536702473958336, "learning_rate": 0.0001, "loss": 4.0877, "loss/crossentropy": 2.066905975341797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112976461648941, "step": 17280 }, { "epoch": 0.34564, "grad_norm": 1.9140625, "grad_norm_var": 0.0059722900390625, "learning_rate": 0.0001, "loss": 4.224, "loss/crossentropy": 2.1789294481277466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20467212051153183, "step": 17282 }, { "epoch": 0.34568, "grad_norm": 1.953125, "grad_norm_var": 0.005736287434895833, "learning_rate": 0.0001, "loss": 3.9716, "loss/crossentropy": 1.8310245275497437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974945366382599, "step": 17284 }, { "epoch": 0.34572, "grad_norm": 1.9609375, "grad_norm_var": 0.006563059488932292, "learning_rate": 0.0001, "loss": 4.1832, "loss/crossentropy": 2.033496856689453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1989584118127823, "step": 17286 }, { "epoch": 0.34576, "grad_norm": 2.0625, "grad_norm_var": 0.0091705322265625, "learning_rate": 0.0001, "loss": 3.8748, "loss/crossentropy": 1.8883211016654968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889789029955864, "step": 17288 }, { "epoch": 0.3458, "grad_norm": 2.015625, "grad_norm_var": 0.008495076497395834, "learning_rate": 0.0001, "loss": 4.1637, "loss/crossentropy": 2.4520593881607056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2183564007282257, "step": 17290 }, { "epoch": 0.34584, "grad_norm": 2.046875, "grad_norm_var": 0.008272298177083333, "learning_rate": 0.0001, "loss": 3.9375, "loss/crossentropy": 2.030746340751648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19382008165121078, "step": 17292 }, { "epoch": 0.34588, "grad_norm": 1.9453125, "grad_norm_var": 0.007295735677083333, "learning_rate": 0.0001, "loss": 4.2182, "loss/crossentropy": 1.9897491931915283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21130450069904327, "step": 17294 }, { "epoch": 0.34592, "grad_norm": 1.9296875, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 3.982, "loss/crossentropy": 1.8000388145446777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20359987765550613, "step": 17296 }, { "epoch": 0.34596, "grad_norm": 1.984375, "grad_norm_var": 0.0065610249837239586, "learning_rate": 0.0001, "loss": 4.0341, "loss/crossentropy": 2.0408164262771606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18042601644992828, "step": 17298 }, { "epoch": 0.346, "grad_norm": 2.015625, "grad_norm_var": 0.006742350260416667, "learning_rate": 0.0001, "loss": 4.0849, "loss/crossentropy": 2.28415846824646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20799801498651505, "step": 17300 }, { "epoch": 0.34604, "grad_norm": 2.25, "grad_norm_var": 0.010925038655598959, "learning_rate": 0.0001, "loss": 4.3258, "loss/crossentropy": 1.9420422315597534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20019559562206268, "step": 17302 }, { "epoch": 0.34608, "grad_norm": 2.234375, "grad_norm_var": 0.0102691650390625, "learning_rate": 0.0001, "loss": 4.1892, "loss/crossentropy": 2.100687026977539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21129299700260162, "step": 17304 }, { "epoch": 0.34612, "grad_norm": 1.921875, "grad_norm_var": 0.010970052083333333, "learning_rate": 0.0001, "loss": 4.0209, "loss/crossentropy": 1.8657938241958618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081414759159088, "step": 17306 }, { "epoch": 0.34616, "grad_norm": 1.8359375, "grad_norm_var": 0.012853749593098958, "learning_rate": 0.0001, "loss": 4.025, "loss/crossentropy": 2.0629169940948486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117018163204193, "step": 17308 }, { "epoch": 0.3462, "grad_norm": 1.96875, "grad_norm_var": 0.0126129150390625, "learning_rate": 0.0001, "loss": 4.159, "loss/crossentropy": 2.2877084016799927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959577053785324, "step": 17310 }, { "epoch": 0.34624, "grad_norm": 2.015625, "grad_norm_var": 0.012280019124348958, "learning_rate": 0.0001, "loss": 4.1135, "loss/crossentropy": 2.2501026391983032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108026072382927, "step": 17312 }, { "epoch": 0.34628, "grad_norm": 2.0, "grad_norm_var": 0.012011464436848958, "learning_rate": 0.0001, "loss": 4.1498, "loss/crossentropy": 1.9485750794410706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126113697886467, "step": 17314 }, { "epoch": 0.34632, "grad_norm": 1.984375, "grad_norm_var": 0.011500803629557292, "learning_rate": 0.0001, "loss": 4.13, "loss/crossentropy": 2.193112373352051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19680321961641312, "step": 17316 }, { "epoch": 0.34636, "grad_norm": 1.796875, "grad_norm_var": 0.010084788004557291, "learning_rate": 0.0001, "loss": 4.1181, "loss/crossentropy": 2.30054771900177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211720421910286, "step": 17318 }, { "epoch": 0.3464, "grad_norm": 2.078125, "grad_norm_var": 0.026869455973307293, "learning_rate": 0.0001, "loss": 4.2737, "loss/crossentropy": 2.0091487169265747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19040243327617645, "step": 17320 }, { "epoch": 0.34644, "grad_norm": 2.0625, "grad_norm_var": 0.036628977457682295, "learning_rate": 0.0001, "loss": 4.2257, "loss/crossentropy": 2.1841423511505127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112671434879303, "step": 17322 }, { "epoch": 0.34648, "grad_norm": 2.171875, "grad_norm_var": 0.03555501302083333, "learning_rate": 0.0001, "loss": 3.8837, "loss/crossentropy": 2.092822253704071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2037336230278015, "step": 17324 }, { "epoch": 0.34652, "grad_norm": 2.09375, "grad_norm_var": 0.0335601806640625, "learning_rate": 0.0001, "loss": 4.0322, "loss/crossentropy": 2.280580163002014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20753052085638046, "step": 17326 }, { "epoch": 0.34656, "grad_norm": 1.7890625, "grad_norm_var": 0.03884048461914062, "learning_rate": 0.0001, "loss": 4.1206, "loss/crossentropy": 1.60361909866333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15820877254009247, "step": 17328 }, { "epoch": 0.3466, "grad_norm": 1.84375, "grad_norm_var": 0.04915949503580729, "learning_rate": 0.0001, "loss": 3.6214, "loss/crossentropy": 1.608814001083374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16808264702558517, "step": 17330 }, { "epoch": 0.34664, "grad_norm": 2.609375, "grad_norm_var": 0.06932373046875, "learning_rate": 0.0001, "loss": 4.4983, "loss/crossentropy": 2.3868669271469116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23717772960662842, "step": 17332 }, { "epoch": 0.34668, "grad_norm": 1.8984375, "grad_norm_var": 0.06720759073893229, "learning_rate": 0.0001, "loss": 4.0862, "loss/crossentropy": 2.1908692717552185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21705647557973862, "step": 17334 }, { "epoch": 0.34672, "grad_norm": 2.140625, "grad_norm_var": 0.05272191365559896, "learning_rate": 0.0001, "loss": 4.2436, "loss/crossentropy": 2.459980010986328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24167515337467194, "step": 17336 }, { "epoch": 0.34676, "grad_norm": 1.984375, "grad_norm_var": 0.04353612263997396, "learning_rate": 0.0001, "loss": 4.219, "loss/crossentropy": 2.221674919128418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2097555324435234, "step": 17338 }, { "epoch": 0.3468, "grad_norm": 1.78125, "grad_norm_var": 0.04521458943684896, "learning_rate": 0.0001, "loss": 3.7565, "loss/crossentropy": 2.2840858697891235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2250964418053627, "step": 17340 }, { "epoch": 0.34684, "grad_norm": 1.9453125, "grad_norm_var": 0.04426854451497396, "learning_rate": 0.0001, "loss": 3.6803, "loss/crossentropy": 2.0715824365615845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149435505270958, "step": 17342 }, { "epoch": 0.34688, "grad_norm": 2.03125, "grad_norm_var": 0.0420806884765625, "learning_rate": 0.0001, "loss": 4.1017, "loss/crossentropy": 2.2389899492263794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22377658635377884, "step": 17344 }, { "epoch": 0.34692, "grad_norm": 2.046875, "grad_norm_var": 0.035278065999348955, "learning_rate": 0.0001, "loss": 3.9827, "loss/crossentropy": 2.0308732986450195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074751853942871, "step": 17346 }, { "epoch": 0.34696, "grad_norm": 1.890625, "grad_norm_var": 0.010864003499348959, "learning_rate": 0.0001, "loss": 3.7901, "loss/crossentropy": 1.8123770356178284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19080865383148193, "step": 17348 }, { "epoch": 0.347, "grad_norm": 1.734375, "grad_norm_var": 0.010367838541666667, "learning_rate": 0.0001, "loss": 3.8772, "loss/crossentropy": 1.884379267692566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799243837594986, "step": 17350 }, { "epoch": 0.34704, "grad_norm": 1.8671875, "grad_norm_var": 0.012442779541015626, "learning_rate": 0.0001, "loss": 4.4317, "loss/crossentropy": 2.237663149833679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20197343081235886, "step": 17352 }, { "epoch": 0.34708, "grad_norm": 1.828125, "grad_norm_var": 0.013337961832682292, "learning_rate": 0.0001, "loss": 3.7444, "loss/crossentropy": 1.9563414454460144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18722663074731827, "step": 17354 }, { "epoch": 0.34712, "grad_norm": 2.3125, "grad_norm_var": 0.020817057291666666, "learning_rate": 0.0001, "loss": 4.2799, "loss/crossentropy": 2.1188591718673706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21183937788009644, "step": 17356 }, { "epoch": 0.34716, "grad_norm": 2.046875, "grad_norm_var": 0.0219146728515625, "learning_rate": 0.0001, "loss": 4.4551, "loss/crossentropy": 2.276149272918701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232571393251419, "step": 17358 }, { "epoch": 0.3472, "grad_norm": 2.0, "grad_norm_var": 0.02123998006184896, "learning_rate": 0.0001, "loss": 4.4329, "loss/crossentropy": 2.360138177871704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22683367878198624, "step": 17360 }, { "epoch": 0.34724, "grad_norm": 1.96875, "grad_norm_var": 0.020576985677083333, "learning_rate": 0.0001, "loss": 4.2762, "loss/crossentropy": 1.99272620677948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18479204177856445, "step": 17362 }, { "epoch": 0.34728, "grad_norm": 1.9921875, "grad_norm_var": 0.019760894775390624, "learning_rate": 0.0001, "loss": 4.2303, "loss/crossentropy": 2.053771197795868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20591863989830017, "step": 17364 }, { "epoch": 0.34732, "grad_norm": 1.9296875, "grad_norm_var": 0.016428375244140626, "learning_rate": 0.0001, "loss": 3.9703, "loss/crossentropy": 2.1311055421829224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049081027507782, "step": 17366 }, { "epoch": 0.34736, "grad_norm": 1.984375, "grad_norm_var": 0.01246337890625, "learning_rate": 0.0001, "loss": 4.1102, "loss/crossentropy": 2.0712148547172546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19809392094612122, "step": 17368 }, { "epoch": 0.3474, "grad_norm": 1.953125, "grad_norm_var": 0.009468587239583333, "learning_rate": 0.0001, "loss": 4.0732, "loss/crossentropy": 2.144728183746338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964795887470245, "step": 17370 }, { "epoch": 0.34744, "grad_norm": 2.03125, "grad_norm_var": 0.003856404622395833, "learning_rate": 0.0001, "loss": 3.9145, "loss/crossentropy": 2.024275004863739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19414371997117996, "step": 17372 }, { "epoch": 0.34748, "grad_norm": 1.84375, "grad_norm_var": 0.00400390625, "learning_rate": 0.0001, "loss": 4.0925, "loss/crossentropy": 2.460733413696289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2185375839471817, "step": 17374 }, { "epoch": 0.34752, "grad_norm": 2.28125, "grad_norm_var": 0.010423787434895833, "learning_rate": 0.0001, "loss": 4.308, "loss/crossentropy": 2.3502203226089478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140796035528183, "step": 17376 }, { "epoch": 0.34756, "grad_norm": 1.78125, "grad_norm_var": 0.012547810872395834, "learning_rate": 0.0001, "loss": 4.0238, "loss/crossentropy": 2.2896264791488647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149006575345993, "step": 17378 }, { "epoch": 0.3476, "grad_norm": 1.8515625, "grad_norm_var": 0.013118489583333334, "learning_rate": 0.0001, "loss": 3.9757, "loss/crossentropy": 2.101209044456482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20842310786247253, "step": 17380 }, { "epoch": 0.34764, "grad_norm": 2.0, "grad_norm_var": 0.0189361572265625, "learning_rate": 0.0001, "loss": 4.3287, "loss/crossentropy": 1.8995551466941833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851196512579918, "step": 17382 }, { "epoch": 0.34768, "grad_norm": 1.9921875, "grad_norm_var": 0.01887995402018229, "learning_rate": 0.0001, "loss": 3.9219, "loss/crossentropy": 1.850643277168274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16962846368551254, "step": 17384 }, { "epoch": 0.34772, "grad_norm": 2.078125, "grad_norm_var": 0.019528961181640624, "learning_rate": 0.0001, "loss": 4.2478, "loss/crossentropy": 2.1131063103675842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19152897596359253, "step": 17386 }, { "epoch": 0.34776, "grad_norm": 1.9921875, "grad_norm_var": 0.020140584309895834, "learning_rate": 0.0001, "loss": 4.1932, "loss/crossentropy": 2.172673463821411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20645174384117126, "step": 17388 }, { "epoch": 0.3478, "grad_norm": 2.09375, "grad_norm_var": 0.019147745768229165, "learning_rate": 0.0001, "loss": 4.269, "loss/crossentropy": 2.431947708129883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20513125509023666, "step": 17390 }, { "epoch": 0.34784, "grad_norm": 1.9453125, "grad_norm_var": 0.014461008707682292, "learning_rate": 0.0001, "loss": 3.9314, "loss/crossentropy": 2.062328279018402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929447129368782, "step": 17392 }, { "epoch": 0.34788, "grad_norm": 2.015625, "grad_norm_var": 0.010609690348307292, "learning_rate": 0.0001, "loss": 4.2277, "loss/crossentropy": 2.189133882522583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20945163816213608, "step": 17394 }, { "epoch": 0.34792, "grad_norm": 1.9296875, "grad_norm_var": 0.010846964518229167, "learning_rate": 0.0001, "loss": 3.9426, "loss/crossentropy": 2.070523977279663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20401575416326523, "step": 17396 }, { "epoch": 0.34796, "grad_norm": 1.9375, "grad_norm_var": 0.0075266520182291664, "learning_rate": 0.0001, "loss": 4.0668, "loss/crossentropy": 2.3387876749038696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19960296899080276, "step": 17398 }, { "epoch": 0.348, "grad_norm": 2.078125, "grad_norm_var": 0.00955810546875, "learning_rate": 0.0001, "loss": 4.0225, "loss/crossentropy": 1.9857355952262878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19195467978715897, "step": 17400 }, { "epoch": 0.34804, "grad_norm": 1.9296875, "grad_norm_var": 0.009041086832682291, "learning_rate": 0.0001, "loss": 4.0715, "loss/crossentropy": 2.2523770332336426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22015457600355148, "step": 17402 }, { "epoch": 0.34808, "grad_norm": 1.875, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 3.9181, "loss/crossentropy": 1.7088012099266052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1644933521747589, "step": 17404 }, { "epoch": 0.34812, "grad_norm": 1.890625, "grad_norm_var": 0.006772613525390625, "learning_rate": 0.0001, "loss": 3.8759, "loss/crossentropy": 2.022001802921295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19237526506185532, "step": 17406 }, { "epoch": 0.34816, "grad_norm": 2.125, "grad_norm_var": 0.015669504801432293, "learning_rate": 0.0001, "loss": 4.2196, "loss/crossentropy": 2.240332841873169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2214636355638504, "step": 17408 }, { "epoch": 0.3482, "grad_norm": 1.9765625, "grad_norm_var": 0.015099843343098959, "learning_rate": 0.0001, "loss": 4.0211, "loss/crossentropy": 2.1392452716827393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21769221127033234, "step": 17410 }, { "epoch": 0.34824, "grad_norm": 1.890625, "grad_norm_var": 0.016078440348307292, "learning_rate": 0.0001, "loss": 4.0243, "loss/crossentropy": 1.9182460308074951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202327162027359, "step": 17412 }, { "epoch": 0.34828, "grad_norm": 2.0625, "grad_norm_var": 0.0153961181640625, "learning_rate": 0.0001, "loss": 4.0654, "loss/crossentropy": 2.036426305770874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19997400790452957, "step": 17414 }, { "epoch": 0.34832, "grad_norm": 1.9296875, "grad_norm_var": 0.014353179931640625, "learning_rate": 0.0001, "loss": 3.9086, "loss/crossentropy": 2.0429354906082153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20519915968179703, "step": 17416 }, { "epoch": 0.34836, "grad_norm": 2.03125, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 4.2742, "loss/crossentropy": 2.3943710327148438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106815129518509, "step": 17418 }, { "epoch": 0.3484, "grad_norm": 2.046875, "grad_norm_var": 0.0130615234375, "learning_rate": 0.0001, "loss": 4.1596, "loss/crossentropy": 2.060115098953247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20672088861465454, "step": 17420 }, { "epoch": 0.34844, "grad_norm": 1.8984375, "grad_norm_var": 0.012471516927083334, "learning_rate": 0.0001, "loss": 4.2223, "loss/crossentropy": 2.2820589542388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23306988179683685, "step": 17422 }, { "epoch": 0.34848, "grad_norm": 1.984375, "grad_norm_var": 0.008788045247395833, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 2.13969624042511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20909114927053452, "step": 17424 }, { "epoch": 0.34852, "grad_norm": 1.953125, "grad_norm_var": 0.011214192708333333, "learning_rate": 0.0001, "loss": 3.9572, "loss/crossentropy": 2.0822505950927734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20429539680480957, "step": 17426 }, { "epoch": 0.34856, "grad_norm": 1.9296875, "grad_norm_var": 0.009759267171223959, "learning_rate": 0.0001, "loss": 3.9435, "loss/crossentropy": 2.512156844139099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21776874363422394, "step": 17428 }, { "epoch": 0.3486, "grad_norm": 2.359375, "grad_norm_var": 0.018700917561848957, "learning_rate": 0.0001, "loss": 4.1145, "loss/crossentropy": 2.0379759669303894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20926732569932938, "step": 17430 }, { "epoch": 0.34864, "grad_norm": 2.046875, "grad_norm_var": 0.017295074462890626, "learning_rate": 0.0001, "loss": 4.1998, "loss/crossentropy": 2.443945050239563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22597475349903107, "step": 17432 }, { "epoch": 0.34868, "grad_norm": 1.8203125, "grad_norm_var": 0.019576009114583334, "learning_rate": 0.0001, "loss": 3.8442, "loss/crossentropy": 1.8295652866363525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18034228682518005, "step": 17434 }, { "epoch": 0.34872, "grad_norm": 1.8671875, "grad_norm_var": 0.020428212483723958, "learning_rate": 0.0001, "loss": 4.0614, "loss/crossentropy": 1.9487475156784058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18921684473752975, "step": 17436 }, { "epoch": 0.34876, "grad_norm": 2.078125, "grad_norm_var": 0.018721262613932293, "learning_rate": 0.0001, "loss": 4.1153, "loss/crossentropy": 2.099911689758301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079886943101883, "step": 17438 }, { "epoch": 0.3488, "grad_norm": 2.015625, "grad_norm_var": 0.016739654541015624, "learning_rate": 0.0001, "loss": 4.1762, "loss/crossentropy": 2.52177894115448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22080882638692856, "step": 17440 }, { "epoch": 0.34884, "grad_norm": 2.125, "grad_norm_var": 0.014900461832682291, "learning_rate": 0.0001, "loss": 4.1076, "loss/crossentropy": 2.222637891769409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22301562875509262, "step": 17442 }, { "epoch": 0.34888, "grad_norm": 2.09375, "grad_norm_var": 0.014725494384765624, "learning_rate": 0.0001, "loss": 4.3965, "loss/crossentropy": 2.2042930126190186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051277905702591, "step": 17444 }, { "epoch": 0.34892, "grad_norm": 2.015625, "grad_norm_var": 0.0065093994140625, "learning_rate": 0.0001, "loss": 4.2452, "loss/crossentropy": 2.0599029064178467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20651433616876602, "step": 17446 }, { "epoch": 0.34896, "grad_norm": 2.0, "grad_norm_var": 0.007027180989583334, "learning_rate": 0.0001, "loss": 3.9174, "loss/crossentropy": 1.9319151639938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18495241552591324, "step": 17448 }, { "epoch": 0.349, "grad_norm": 1.8671875, "grad_norm_var": 0.006197102864583333, "learning_rate": 0.0001, "loss": 4.0844, "loss/crossentropy": 2.341706871986389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20252344757318497, "step": 17450 }, { "epoch": 0.34904, "grad_norm": 1.953125, "grad_norm_var": 0.004833984375, "learning_rate": 0.0001, "loss": 4.0301, "loss/crossentropy": 2.0161609053611755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2176668643951416, "step": 17452 }, { "epoch": 0.34908, "grad_norm": 1.8359375, "grad_norm_var": 0.0062334696451822914, "learning_rate": 0.0001, "loss": 3.6046, "loss/crossentropy": 1.7961083054542542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16384318470954895, "step": 17454 }, { "epoch": 0.34912, "grad_norm": 2.265625, "grad_norm_var": 0.01791966756184896, "learning_rate": 0.0001, "loss": 4.6054, "loss/crossentropy": 2.108873188495636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21137713640928268, "step": 17456 }, { "epoch": 0.34916, "grad_norm": 1.921875, "grad_norm_var": 0.017830403645833333, "learning_rate": 0.0001, "loss": 3.9543, "loss/crossentropy": 2.158664584159851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19957562536001205, "step": 17458 }, { "epoch": 0.3492, "grad_norm": 1.84375, "grad_norm_var": 0.018660227457682293, "learning_rate": 0.0001, "loss": 3.9454, "loss/crossentropy": 1.7799381017684937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193112812936306, "step": 17460 }, { "epoch": 0.34924, "grad_norm": 2.0625, "grad_norm_var": 0.019128163655598957, "learning_rate": 0.0001, "loss": 4.4072, "loss/crossentropy": 2.372501015663147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180323675274849, "step": 17462 }, { "epoch": 0.34928, "grad_norm": 1.96875, "grad_norm_var": 0.0185699462890625, "learning_rate": 0.0001, "loss": 4.0343, "loss/crossentropy": 1.9338968396186829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19117196649312973, "step": 17464 }, { "epoch": 0.34932, "grad_norm": 1.8125, "grad_norm_var": 0.020774078369140626, "learning_rate": 0.0001, "loss": 3.6429, "loss/crossentropy": 1.4602742195129395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1677519753575325, "step": 17466 }, { "epoch": 0.34936, "grad_norm": 2.0, "grad_norm_var": 0.02080078125, "learning_rate": 0.0001, "loss": 4.2498, "loss/crossentropy": 2.1464006304740906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21201838552951813, "step": 17468 }, { "epoch": 0.3494, "grad_norm": 2.109375, "grad_norm_var": 0.018529256184895832, "learning_rate": 0.0001, "loss": 4.1736, "loss/crossentropy": 2.071315884590149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19322700798511505, "step": 17470 }, { "epoch": 0.34944, "grad_norm": 1.984375, "grad_norm_var": 0.010309855143229166, "learning_rate": 0.0001, "loss": 4.2653, "loss/crossentropy": 1.9166500568389893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23960981518030167, "step": 17472 }, { "epoch": 0.34948, "grad_norm": 2.0625, "grad_norm_var": 0.011457316080729167, "learning_rate": 0.0001, "loss": 3.9433, "loss/crossentropy": 1.8666390180587769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1875123456120491, "step": 17474 }, { "epoch": 0.34952, "grad_norm": 2.015625, "grad_norm_var": 0.009736887613932292, "learning_rate": 0.0001, "loss": 4.0984, "loss/crossentropy": 1.9197289943695068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21075090020895004, "step": 17476 }, { "epoch": 0.34956, "grad_norm": 1.90625, "grad_norm_var": 0.011018625895182292, "learning_rate": 0.0001, "loss": 4.0971, "loss/crossentropy": 1.904780387878418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20412997901439667, "step": 17478 }, { "epoch": 0.3496, "grad_norm": 1.859375, "grad_norm_var": 0.012325032552083334, "learning_rate": 0.0001, "loss": 3.735, "loss/crossentropy": 1.6775096654891968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1669272631406784, "step": 17480 }, { "epoch": 0.34964, "grad_norm": 2.015625, "grad_norm_var": 0.0089111328125, "learning_rate": 0.0001, "loss": 4.1039, "loss/crossentropy": 2.050463318824768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19382070004940033, "step": 17482 }, { "epoch": 0.34968, "grad_norm": 2.015625, "grad_norm_var": 0.009224192301432291, "learning_rate": 0.0001, "loss": 3.8739, "loss/crossentropy": 1.8124673962593079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20674846321344376, "step": 17484 }, { "epoch": 0.34972, "grad_norm": 1.8984375, "grad_norm_var": 0.008969879150390625, "learning_rate": 0.0001, "loss": 3.6797, "loss/crossentropy": 1.6487451791763306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17874807119369507, "step": 17486 }, { "epoch": 0.34976, "grad_norm": 2.171875, "grad_norm_var": 0.009430948893229167, "learning_rate": 0.0001, "loss": 4.4025, "loss/crossentropy": 2.101687431335449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24856076389551163, "step": 17488 }, { "epoch": 0.3498, "grad_norm": 2.015625, "grad_norm_var": 0.009205881754557292, "learning_rate": 0.0001, "loss": 4.0236, "loss/crossentropy": 1.9233632683753967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21046236902475357, "step": 17490 }, { "epoch": 0.34984, "grad_norm": 2.09375, "grad_norm_var": 0.010589345296223959, "learning_rate": 0.0001, "loss": 4.0376, "loss/crossentropy": 2.2882679104804993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23568424582481384, "step": 17492 }, { "epoch": 0.34988, "grad_norm": 2.0625, "grad_norm_var": 0.022342681884765625, "learning_rate": 0.0001, "loss": 4.4205, "loss/crossentropy": 2.205874502658844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21892941743135452, "step": 17494 }, { "epoch": 0.34992, "grad_norm": 2.234375, "grad_norm_var": 0.0232177734375, "learning_rate": 0.0001, "loss": 4.3628, "loss/crossentropy": 2.278464913368225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22409500926733017, "step": 17496 }, { "epoch": 0.34996, "grad_norm": 1.859375, "grad_norm_var": 0.026082356770833332, "learning_rate": 0.0001, "loss": 4.0736, "loss/crossentropy": 2.5003273487091064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22478827834129333, "step": 17498 }, { "epoch": 0.35, "grad_norm": 2.0625, "grad_norm_var": 0.04237848917643229, "learning_rate": 0.0001, "loss": 4.3658, "loss/crossentropy": 2.034249722957611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19176610559225082, "step": 17500 }, { "epoch": 0.35004, "grad_norm": 1.9921875, "grad_norm_var": 0.0401031494140625, "learning_rate": 0.0001, "loss": 4.1586, "loss/crossentropy": 2.271065592765808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21523310244083405, "step": 17502 }, { "epoch": 0.35008, "grad_norm": 1.984375, "grad_norm_var": 0.03870824178059896, "learning_rate": 0.0001, "loss": 4.1198, "loss/crossentropy": 2.182866334915161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21311558783054352, "step": 17504 }, { "epoch": 0.35012, "grad_norm": 2.125, "grad_norm_var": 0.03878173828125, "learning_rate": 0.0001, "loss": 4.1471, "loss/crossentropy": 1.7134016752243042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16884544491767883, "step": 17506 }, { "epoch": 0.35016, "grad_norm": 2.125, "grad_norm_var": 0.03687718709309896, "learning_rate": 0.0001, "loss": 4.2464, "loss/crossentropy": 2.198198080062866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916792169213295, "step": 17508 }, { "epoch": 0.3502, "grad_norm": 2.0625, "grad_norm_var": 0.0272125244140625, "learning_rate": 0.0001, "loss": 4.2651, "loss/crossentropy": 1.9370547533035278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23538394272327423, "step": 17510 }, { "epoch": 0.35024, "grad_norm": 2.078125, "grad_norm_var": 0.02469456990559896, "learning_rate": 0.0001, "loss": 4.0664, "loss/crossentropy": 2.1743874549865723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22514399886131287, "step": 17512 }, { "epoch": 0.35028, "grad_norm": 2.03125, "grad_norm_var": 0.02174657185872396, "learning_rate": 0.0001, "loss": 4.2767, "loss/crossentropy": 1.8546866178512573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772822067141533, "step": 17514 }, { "epoch": 0.35032, "grad_norm": 2.0, "grad_norm_var": 0.004154205322265625, "learning_rate": 0.0001, "loss": 4.2603, "loss/crossentropy": 1.8439211249351501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19603434205055237, "step": 17516 }, { "epoch": 0.35036, "grad_norm": 1.8125, "grad_norm_var": 0.006843058268229166, "learning_rate": 0.0001, "loss": 3.9506, "loss/crossentropy": 2.2717851400375366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21598497033119202, "step": 17518 }, { "epoch": 0.3504, "grad_norm": 1.921875, "grad_norm_var": 0.006843058268229166, "learning_rate": 0.0001, "loss": 4.1807, "loss/crossentropy": 2.2645692825317383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2229163572192192, "step": 17520 }, { "epoch": 0.35044, "grad_norm": 1.828125, "grad_norm_var": 0.0071604410807291664, "learning_rate": 0.0001, "loss": 4.09, "loss/crossentropy": 2.0507450103759766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19268949329853058, "step": 17522 }, { "epoch": 0.35048, "grad_norm": 2.0, "grad_norm_var": 0.005521647135416667, "learning_rate": 0.0001, "loss": 4.1468, "loss/crossentropy": 2.3831188678741455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21931231766939163, "step": 17524 }, { "epoch": 0.35052, "grad_norm": 1.9765625, "grad_norm_var": 0.006281534830729167, "learning_rate": 0.0001, "loss": 4.116, "loss/crossentropy": 1.8574120998382568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17972451448440552, "step": 17526 }, { "epoch": 0.35056, "grad_norm": 1.875, "grad_norm_var": 0.006013997395833333, "learning_rate": 0.0001, "loss": 3.9063, "loss/crossentropy": 2.141213893890381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21798025816679, "step": 17528 }, { "epoch": 0.3506, "grad_norm": 1.8359375, "grad_norm_var": 0.006520334879557292, "learning_rate": 0.0001, "loss": 3.9454, "loss/crossentropy": 2.2224762439727783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2031020149588585, "step": 17530 }, { "epoch": 0.35064, "grad_norm": 1.9140625, "grad_norm_var": 0.006476847330729166, "learning_rate": 0.0001, "loss": 3.765, "loss/crossentropy": 2.163232743740082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22002054750919342, "step": 17532 }, { "epoch": 0.35068, "grad_norm": 1.9140625, "grad_norm_var": 0.0055653889973958336, "learning_rate": 0.0001, "loss": 4.1397, "loss/crossentropy": 2.1463611721992493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20084905624389648, "step": 17534 }, { "epoch": 0.35072, "grad_norm": 2.015625, "grad_norm_var": 0.006566365559895833, "learning_rate": 0.0001, "loss": 3.9751, "loss/crossentropy": 2.1863406896591187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150622010231018, "step": 17536 }, { "epoch": 0.35076, "grad_norm": 1.9609375, "grad_norm_var": 0.005744425455729166, "learning_rate": 0.0001, "loss": 4.0958, "loss/crossentropy": 2.2162610292434692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20196689665317535, "step": 17538 }, { "epoch": 0.3508, "grad_norm": 10.6875, "grad_norm_var": 4.776464589436849, "learning_rate": 0.0001, "loss": 4.1004, "loss/crossentropy": 1.8908233642578125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23238417506217957, "step": 17540 }, { "epoch": 0.35084, "grad_norm": 2.125, "grad_norm_var": 4.763142903645833, "learning_rate": 0.0001, "loss": 4.416, "loss/crossentropy": 2.3578076362609863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21626722812652588, "step": 17542 }, { "epoch": 0.35088, "grad_norm": 2.234375, "grad_norm_var": 4.736722819010416, "learning_rate": 0.0001, "loss": 4.3243, "loss/crossentropy": 2.4079915285110474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2292388305068016, "step": 17544 }, { "epoch": 0.35092, "grad_norm": 2.015625, "grad_norm_var": 4.717746734619141, "learning_rate": 0.0001, "loss": 4.2647, "loss/crossentropy": 2.1021666526794434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20725515484809875, "step": 17546 }, { "epoch": 0.35096, "grad_norm": 1.96875, "grad_norm_var": 4.707061513264974, "learning_rate": 0.0001, "loss": 4.0905, "loss/crossentropy": 2.039289176464081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614883303642273, "step": 17548 }, { "epoch": 0.351, "grad_norm": 2.203125, "grad_norm_var": 4.68468017578125, "learning_rate": 0.0001, "loss": 4.269, "loss/crossentropy": 1.9731826782226562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2164870798587799, "step": 17550 }, { "epoch": 0.35104, "grad_norm": 1.875, "grad_norm_var": 4.69991455078125, "learning_rate": 0.0001, "loss": 4.0785, "loss/crossentropy": 2.0166266560554504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977868676185608, "step": 17552 }, { "epoch": 0.35108, "grad_norm": 2.0, "grad_norm_var": 4.692333730061849, "learning_rate": 0.0001, "loss": 4.2823, "loss/crossentropy": 2.284690737724304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2204381301999092, "step": 17554 }, { "epoch": 0.35112, "grad_norm": 1.8828125, "grad_norm_var": 0.012977854410807291, "learning_rate": 0.0001, "loss": 4.2629, "loss/crossentropy": 2.293992757797241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21718238294124603, "step": 17556 }, { "epoch": 0.35116, "grad_norm": 2.140625, "grad_norm_var": 0.013133748372395834, "learning_rate": 0.0001, "loss": 4.0687, "loss/crossentropy": 2.252563714981079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23132775723934174, "step": 17558 }, { "epoch": 0.3512, "grad_norm": 1.921875, "grad_norm_var": 0.0097564697265625, "learning_rate": 0.0001, "loss": 3.977, "loss/crossentropy": 1.8454533219337463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20084110647439957, "step": 17560 }, { "epoch": 0.35124, "grad_norm": 2.0625, "grad_norm_var": 0.009616851806640625, "learning_rate": 0.0001, "loss": 4.1526, "loss/crossentropy": 2.3160746097564697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21168464422225952, "step": 17562 }, { "epoch": 0.35128, "grad_norm": 2.03125, "grad_norm_var": 0.00953369140625, "learning_rate": 0.0001, "loss": 4.031, "loss/crossentropy": 2.0470253229141235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728407055139542, "step": 17564 }, { "epoch": 0.35132, "grad_norm": 2.0, "grad_norm_var": 0.0064389546712239586, "learning_rate": 0.0001, "loss": 4.404, "loss/crossentropy": 2.098679304122925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2133883535861969, "step": 17566 }, { "epoch": 0.35136, "grad_norm": 1.9765625, "grad_norm_var": 0.005615234375, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 1.975411057472229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19393378496170044, "step": 17568 }, { "epoch": 0.3514, "grad_norm": 2.0625, "grad_norm_var": 0.005606842041015625, "learning_rate": 0.0001, "loss": 4.2861, "loss/crossentropy": 2.594779372215271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2499995082616806, "step": 17570 }, { "epoch": 0.35144, "grad_norm": 1.875, "grad_norm_var": 0.005915323893229167, "learning_rate": 0.0001, "loss": 4.1045, "loss/crossentropy": 2.075734496116638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.207140251994133, "step": 17572 }, { "epoch": 0.35148, "grad_norm": 1.765625, "grad_norm_var": 0.0062978108723958336, "learning_rate": 0.0001, "loss": 3.869, "loss/crossentropy": 2.124872624874115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20058108121156693, "step": 17574 }, { "epoch": 0.35152, "grad_norm": 2.0625, "grad_norm_var": 0.012056477864583333, "learning_rate": 0.0001, "loss": 4.1552, "loss/crossentropy": 2.0275574922561646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2105395644903183, "step": 17576 }, { "epoch": 0.35156, "grad_norm": 1.9921875, "grad_norm_var": 0.011913045247395834, "learning_rate": 0.0001, "loss": 4.248, "loss/crossentropy": 2.0694713592529297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21342340111732483, "step": 17578 }, { "epoch": 0.3516, "grad_norm": 2.0625, "grad_norm_var": 0.012239583333333333, "learning_rate": 0.0001, "loss": 3.9729, "loss/crossentropy": 1.8013625741004944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1766640990972519, "step": 17580 }, { "epoch": 0.35164, "grad_norm": 2.015625, "grad_norm_var": 0.0123443603515625, "learning_rate": 0.0001, "loss": 4.0313, "loss/crossentropy": 2.1990463733673096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19608185440301895, "step": 17582 }, { "epoch": 0.35168, "grad_norm": 1.8671875, "grad_norm_var": 0.013361612955729166, "learning_rate": 0.0001, "loss": 3.9041, "loss/crossentropy": 2.0714540481567383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20064443349838257, "step": 17584 }, { "epoch": 0.35172, "grad_norm": 1.9453125, "grad_norm_var": 0.015242258707682291, "learning_rate": 0.0001, "loss": 3.7431, "loss/crossentropy": 1.7437097430229187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17001167684793472, "step": 17586 }, { "epoch": 0.35176, "grad_norm": 2.265625, "grad_norm_var": 0.020531972249348957, "learning_rate": 0.0001, "loss": 4.1987, "loss/crossentropy": 2.072261691093445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21019883453845978, "step": 17588 }, { "epoch": 0.3518, "grad_norm": 2.28125, "grad_norm_var": 0.021922810872395834, "learning_rate": 0.0001, "loss": 4.0826, "loss/crossentropy": 2.1804715394973755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21589763462543488, "step": 17590 }, { "epoch": 0.35184, "grad_norm": 2.0, "grad_norm_var": 0.01734619140625, "learning_rate": 0.0001, "loss": 4.0964, "loss/crossentropy": 2.0044930577278137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20204917341470718, "step": 17592 }, { "epoch": 0.35188, "grad_norm": 2.015625, "grad_norm_var": 0.017362467447916665, "learning_rate": 0.0001, "loss": 4.1394, "loss/crossentropy": 1.8961025476455688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1865018978714943, "step": 17594 }, { "epoch": 0.35192, "grad_norm": 1.9765625, "grad_norm_var": 0.016971588134765625, "learning_rate": 0.0001, "loss": 3.8503, "loss/crossentropy": 1.7005563378334045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18951866030693054, "step": 17596 }, { "epoch": 0.35196, "grad_norm": 1.953125, "grad_norm_var": 0.017002105712890625, "learning_rate": 0.0001, "loss": 4.0044, "loss/crossentropy": 2.002982437610626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076285257935524, "step": 17598 }, { "epoch": 0.352, "grad_norm": 1.90625, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 4.0595, "loss/crossentropy": 2.0283620357513428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20175684988498688, "step": 17600 }, { "epoch": 0.35204, "grad_norm": 1.9375, "grad_norm_var": 0.014289347330729167, "learning_rate": 0.0001, "loss": 3.9495, "loss/crossentropy": 1.855184018611908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19368328154087067, "step": 17602 }, { "epoch": 0.35208, "grad_norm": 1.96875, "grad_norm_var": 0.009089152018229166, "learning_rate": 0.0001, "loss": 4.2364, "loss/crossentropy": 2.2288308143615723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121831700205803, "step": 17604 }, { "epoch": 0.35212, "grad_norm": 1.953125, "grad_norm_var": 0.002512359619140625, "learning_rate": 0.0001, "loss": 4.1398, "loss/crossentropy": 1.9114368557929993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17937570065259933, "step": 17606 }, { "epoch": 0.35216, "grad_norm": 1.921875, "grad_norm_var": 0.0026751200358072916, "learning_rate": 0.0001, "loss": 3.9921, "loss/crossentropy": 2.238133430480957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23264692723751068, "step": 17608 }, { "epoch": 0.3522, "grad_norm": 1.9375, "grad_norm_var": 0.0024169921875, "learning_rate": 0.0001, "loss": 3.85, "loss/crossentropy": 1.7219743728637695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17108283936977386, "step": 17610 }, { "epoch": 0.35224, "grad_norm": 1.890625, "grad_norm_var": 0.005037434895833333, "learning_rate": 0.0001, "loss": 3.8406, "loss/crossentropy": 1.9613978862762451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042389214038849, "step": 17612 }, { "epoch": 0.35228, "grad_norm": 1.859375, "grad_norm_var": 0.005132802327473958, "learning_rate": 0.0001, "loss": 3.8567, "loss/crossentropy": 1.946107029914856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19604258984327316, "step": 17614 }, { "epoch": 0.35232, "grad_norm": 1.921875, "grad_norm_var": 0.005077870686848959, "learning_rate": 0.0001, "loss": 4.1404, "loss/crossentropy": 2.255744218826294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2184707298874855, "step": 17616 }, { "epoch": 0.35236, "grad_norm": 2.0, "grad_norm_var": 0.0058062235514322914, "learning_rate": 0.0001, "loss": 4.1852, "loss/crossentropy": 2.339258551597595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21353042870759964, "step": 17618 }, { "epoch": 0.3524, "grad_norm": 1.984375, "grad_norm_var": 0.0054595947265625, "learning_rate": 0.0001, "loss": 3.9494, "loss/crossentropy": 1.8912869691848755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19134657084941864, "step": 17620 }, { "epoch": 0.35244, "grad_norm": 2.03125, "grad_norm_var": 0.0058258056640625, "learning_rate": 0.0001, "loss": 4.2607, "loss/crossentropy": 2.186914384365082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20765449851751328, "step": 17622 }, { "epoch": 0.35248, "grad_norm": 1.8828125, "grad_norm_var": 0.005773671468098958, "learning_rate": 0.0001, "loss": 3.8567, "loss/crossentropy": 1.6339558959007263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18255474418401718, "step": 17624 }, { "epoch": 0.35252, "grad_norm": 1.9296875, "grad_norm_var": 0.005796051025390625, "learning_rate": 0.0001, "loss": 3.9593, "loss/crossentropy": 2.030660629272461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1943136677145958, "step": 17626 }, { "epoch": 0.35256, "grad_norm": 2.046875, "grad_norm_var": 0.004173787434895834, "learning_rate": 0.0001, "loss": 3.9738, "loss/crossentropy": 1.8040945529937744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18390020728111267, "step": 17628 }, { "epoch": 0.3526, "grad_norm": 2.15625, "grad_norm_var": 0.0055539449055989586, "learning_rate": 0.0001, "loss": 4.4613, "loss/crossentropy": 1.9993014335632324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20794235169887543, "step": 17630 }, { "epoch": 0.35264, "grad_norm": 1.9609375, "grad_norm_var": 0.004881795247395833, "learning_rate": 0.0001, "loss": 3.8539, "loss/crossentropy": 2.0751482248306274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20385687798261642, "step": 17632 }, { "epoch": 0.35268, "grad_norm": 2.078125, "grad_norm_var": 0.0052073160807291664, "learning_rate": 0.0001, "loss": 4.0059, "loss/crossentropy": 1.8923512697219849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879557192325592, "step": 17634 }, { "epoch": 0.35272, "grad_norm": 1.984375, "grad_norm_var": 0.004935709635416666, "learning_rate": 0.0001, "loss": 4.2176, "loss/crossentropy": 2.068901300430298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21545641124248505, "step": 17636 }, { "epoch": 0.35276, "grad_norm": 2.046875, "grad_norm_var": 0.005968983968098958, "learning_rate": 0.0001, "loss": 3.8627, "loss/crossentropy": 1.8329379558563232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1875568851828575, "step": 17638 }, { "epoch": 0.3528, "grad_norm": 1.90625, "grad_norm_var": 0.007811482747395833, "learning_rate": 0.0001, "loss": 4.0878, "loss/crossentropy": 2.2368232011795044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22392207384109497, "step": 17640 }, { "epoch": 0.35284, "grad_norm": 2.3125, "grad_norm_var": 0.014972941080729166, "learning_rate": 0.0001, "loss": 4.0397, "loss/crossentropy": 2.2503843307495117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21132144331932068, "step": 17642 }, { "epoch": 0.35288, "grad_norm": 1.9140625, "grad_norm_var": 0.01582819620768229, "learning_rate": 0.0001, "loss": 4.0459, "loss/crossentropy": 2.1919764280319214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197752483189106, "step": 17644 }, { "epoch": 0.35292, "grad_norm": 1.9921875, "grad_norm_var": 0.0148345947265625, "learning_rate": 0.0001, "loss": 3.9122, "loss/crossentropy": 1.9867302775382996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115078568458557, "step": 17646 }, { "epoch": 0.35296, "grad_norm": 1.90625, "grad_norm_var": 0.017899576822916666, "learning_rate": 0.0001, "loss": 3.8369, "loss/crossentropy": 1.9015426635742188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17188573628664017, "step": 17648 }, { "epoch": 0.353, "grad_norm": 1.875, "grad_norm_var": 0.019006093343098957, "learning_rate": 0.0001, "loss": 3.6023, "loss/crossentropy": 1.8092533946037292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1764475554227829, "step": 17650 }, { "epoch": 0.35304, "grad_norm": 1.8828125, "grad_norm_var": 0.01936620076497396, "learning_rate": 0.0001, "loss": 4.0542, "loss/crossentropy": 2.0736488103866577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20046666264533997, "step": 17652 }, { "epoch": 0.35308, "grad_norm": 2.078125, "grad_norm_var": 0.019364166259765624, "learning_rate": 0.0001, "loss": 3.9716, "loss/crossentropy": 2.1787428855895996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23303276300430298, "step": 17654 }, { "epoch": 0.35312, "grad_norm": 2.0, "grad_norm_var": 0.017020416259765626, "learning_rate": 0.0001, "loss": 4.4889, "loss/crossentropy": 2.419381618499756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23756036162376404, "step": 17656 }, { "epoch": 0.35316, "grad_norm": 2.09375, "grad_norm_var": 0.012111155192057292, "learning_rate": 0.0001, "loss": 4.4202, "loss/crossentropy": 2.4017220735549927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2373708263039589, "step": 17658 }, { "epoch": 0.3532, "grad_norm": 1.9921875, "grad_norm_var": 0.011919911702473958, "learning_rate": 0.0001, "loss": 3.8805, "loss/crossentropy": 1.7796767354011536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19661501049995422, "step": 17660 }, { "epoch": 0.35324, "grad_norm": 2.015625, "grad_norm_var": 0.013158162434895834, "learning_rate": 0.0001, "loss": 4.2215, "loss/crossentropy": 2.1815608739852905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20645780861377716, "step": 17662 }, { "epoch": 0.35328, "grad_norm": 2.03125, "grad_norm_var": 0.010178375244140624, "learning_rate": 0.0001, "loss": 4.2148, "loss/crossentropy": 2.056411921977997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21581003069877625, "step": 17664 }, { "epoch": 0.35332, "grad_norm": 1.8984375, "grad_norm_var": 0.007824452718098958, "learning_rate": 0.0001, "loss": 3.8676, "loss/crossentropy": 1.8901747465133667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19498124718666077, "step": 17666 }, { "epoch": 0.35336, "grad_norm": 2.0, "grad_norm_var": 0.006493123372395834, "learning_rate": 0.0001, "loss": 4.0884, "loss/crossentropy": 2.0241716504096985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19405542314052582, "step": 17668 }, { "epoch": 0.3534, "grad_norm": 1.96875, "grad_norm_var": 0.008241526285807292, "learning_rate": 0.0001, "loss": 4.1452, "loss/crossentropy": 2.016151189804077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1856973022222519, "step": 17670 }, { "epoch": 0.35344, "grad_norm": 2.078125, "grad_norm_var": 0.009096018473307292, "learning_rate": 0.0001, "loss": 4.3895, "loss/crossentropy": 2.5066399574279785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23949292302131653, "step": 17672 }, { "epoch": 0.35348, "grad_norm": 2.015625, "grad_norm_var": 0.007236480712890625, "learning_rate": 0.0001, "loss": 4.1432, "loss/crossentropy": 2.1384177207946777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21078043431043625, "step": 17674 }, { "epoch": 0.35352, "grad_norm": 1.8984375, "grad_norm_var": 0.0070709228515625, "learning_rate": 0.0001, "loss": 4.1732, "loss/crossentropy": 2.1211976408958435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064737230539322, "step": 17676 }, { "epoch": 0.35356, "grad_norm": 2.078125, "grad_norm_var": 0.0072418212890625, "learning_rate": 0.0001, "loss": 4.3103, "loss/crossentropy": 2.018574059009552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20446718484163284, "step": 17678 }, { "epoch": 0.3536, "grad_norm": 2.015625, "grad_norm_var": 0.008186848958333333, "learning_rate": 0.0001, "loss": 3.9862, "loss/crossentropy": 2.10149747133255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974436193704605, "step": 17680 }, { "epoch": 0.35364, "grad_norm": 2.0625, "grad_norm_var": 0.009698232014973959, "learning_rate": 0.0001, "loss": 4.3133, "loss/crossentropy": 2.288322687149048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22535154223442078, "step": 17682 }, { "epoch": 0.35368, "grad_norm": 1.921875, "grad_norm_var": 0.010400136311848959, "learning_rate": 0.0001, "loss": 4.3396, "loss/crossentropy": 2.002479314804077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20964795351028442, "step": 17684 }, { "epoch": 0.35372, "grad_norm": 1.8984375, "grad_norm_var": 0.0099029541015625, "learning_rate": 0.0001, "loss": 4.0116, "loss/crossentropy": 2.2548930644989014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19946102052927017, "step": 17686 }, { "epoch": 0.35376, "grad_norm": 2.296875, "grad_norm_var": 0.015006510416666667, "learning_rate": 0.0001, "loss": 4.1777, "loss/crossentropy": 2.3636194467544556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21626830101013184, "step": 17688 }, { "epoch": 0.3538, "grad_norm": 2.15625, "grad_norm_var": 0.016471099853515626, "learning_rate": 0.0001, "loss": 4.1137, "loss/crossentropy": 2.2401771545410156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202116459608078, "step": 17690 }, { "epoch": 0.35384, "grad_norm": 1.9453125, "grad_norm_var": 0.01602783203125, "learning_rate": 0.0001, "loss": 4.2248, "loss/crossentropy": 2.192560911178589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20161370187997818, "step": 17692 }, { "epoch": 0.35388, "grad_norm": 2.078125, "grad_norm_var": 0.01495361328125, "learning_rate": 0.0001, "loss": 4.175, "loss/crossentropy": 2.31100332736969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23697054386138916, "step": 17694 }, { "epoch": 0.35392, "grad_norm": 2.171875, "grad_norm_var": 0.014139811197916666, "learning_rate": 0.0001, "loss": 4.0894, "loss/crossentropy": 2.018012821674347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954345926642418, "step": 17696 }, { "epoch": 0.35396, "grad_norm": 2.203125, "grad_norm_var": 0.0168853759765625, "learning_rate": 0.0001, "loss": 4.3128, "loss/crossentropy": 2.0922133326530457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2219238504767418, "step": 17698 }, { "epoch": 0.354, "grad_norm": 2.03125, "grad_norm_var": 0.01591796875, "learning_rate": 0.0001, "loss": 4.3803, "loss/crossentropy": 2.727385640144348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21042727679014206, "step": 17700 }, { "epoch": 0.35404, "grad_norm": 1.8984375, "grad_norm_var": 0.014631144205729167, "learning_rate": 0.0001, "loss": 3.9535, "loss/crossentropy": 1.8388070464134216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18624315410852432, "step": 17702 }, { "epoch": 0.35408, "grad_norm": 2.03125, "grad_norm_var": 0.008719889322916667, "learning_rate": 0.0001, "loss": 3.9327, "loss/crossentropy": 1.879045307636261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827290803194046, "step": 17704 }, { "epoch": 0.35412, "grad_norm": 1.9921875, "grad_norm_var": 0.0075266520182291664, "learning_rate": 0.0001, "loss": 4.2715, "loss/crossentropy": 2.2604658603668213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22230257838964462, "step": 17706 }, { "epoch": 0.35416, "grad_norm": 1.7890625, "grad_norm_var": 0.01329345703125, "learning_rate": 0.0001, "loss": 3.7828, "loss/crossentropy": 2.1200287342071533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20524511486291885, "step": 17708 }, { "epoch": 0.3542, "grad_norm": 1.96875, "grad_norm_var": 0.014135487874348958, "learning_rate": 0.0001, "loss": 3.9624, "loss/crossentropy": 1.933307945728302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19991742819547653, "step": 17710 }, { "epoch": 0.35424, "grad_norm": 2.015625, "grad_norm_var": 0.013231404622395833, "learning_rate": 0.0001, "loss": 4.0673, "loss/crossentropy": 1.9442221522331238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861218512058258, "step": 17712 }, { "epoch": 0.35428, "grad_norm": 1.90625, "grad_norm_var": 0.008885701497395834, "learning_rate": 0.0001, "loss": 3.6592, "loss/crossentropy": 1.7169482111930847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16892607510089874, "step": 17714 }, { "epoch": 0.35432, "grad_norm": 1.9921875, "grad_norm_var": 0.006891886393229167, "learning_rate": 0.0001, "loss": 4.3472, "loss/crossentropy": 2.105340003967285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19026386737823486, "step": 17716 }, { "epoch": 0.35436, "grad_norm": 1.90625, "grad_norm_var": 0.0069976806640625, "learning_rate": 0.0001, "loss": 4.1666, "loss/crossentropy": 2.0366504192352295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19709386676549911, "step": 17718 }, { "epoch": 0.3544, "grad_norm": 2.078125, "grad_norm_var": 0.007393391927083334, "learning_rate": 0.0001, "loss": 3.9829, "loss/crossentropy": 1.9578897356987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19842654466629028, "step": 17720 }, { "epoch": 0.35444, "grad_norm": 1.890625, "grad_norm_var": 0.006888834635416666, "learning_rate": 0.0001, "loss": 3.8608, "loss/crossentropy": 2.139336943626404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19530436396598816, "step": 17722 }, { "epoch": 0.35448, "grad_norm": 2.140625, "grad_norm_var": 0.009669748942057292, "learning_rate": 0.0001, "loss": 4.3803, "loss/crossentropy": 2.4077011346817017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2209417074918747, "step": 17724 }, { "epoch": 0.35452, "grad_norm": 1.953125, "grad_norm_var": 0.009373982747395834, "learning_rate": 0.0001, "loss": 4.2412, "loss/crossentropy": 2.1217586994171143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052367851138115, "step": 17726 }, { "epoch": 0.35456, "grad_norm": 1.90625, "grad_norm_var": 0.009751129150390624, "learning_rate": 0.0001, "loss": 4.1872, "loss/crossentropy": 2.078941583633423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033359408378601, "step": 17728 }, { "epoch": 0.3546, "grad_norm": 2.09375, "grad_norm_var": 0.0072265625, "learning_rate": 0.0001, "loss": 4.3514, "loss/crossentropy": 2.1763141751289368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22765995562076569, "step": 17730 }, { "epoch": 0.35464, "grad_norm": 2.09375, "grad_norm_var": 0.00784912109375, "learning_rate": 0.0001, "loss": 4.1094, "loss/crossentropy": 1.83991938829422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18603162467479706, "step": 17732 }, { "epoch": 0.35468, "grad_norm": 2.015625, "grad_norm_var": 0.009132639567057291, "learning_rate": 0.0001, "loss": 3.93, "loss/crossentropy": 2.021396040916443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19647164642810822, "step": 17734 }, { "epoch": 0.35472, "grad_norm": 2.0, "grad_norm_var": 0.009297434488932292, "learning_rate": 0.0001, "loss": 4.2137, "loss/crossentropy": 2.2339202165603638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20721730589866638, "step": 17736 }, { "epoch": 0.35476, "grad_norm": 1.9921875, "grad_norm_var": 0.008017730712890626, "learning_rate": 0.0001, "loss": 4.1859, "loss/crossentropy": 2.2280211448669434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21610353142023087, "step": 17738 }, { "epoch": 0.3548, "grad_norm": 1.8359375, "grad_norm_var": 0.0082672119140625, "learning_rate": 0.0001, "loss": 3.8836, "loss/crossentropy": 1.8112387657165527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18565281480550766, "step": 17740 }, { "epoch": 0.35484, "grad_norm": 2.140625, "grad_norm_var": 0.009098307291666666, "learning_rate": 0.0001, "loss": 4.5429, "loss/crossentropy": 2.1125651597976685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1949760466814041, "step": 17742 }, { "epoch": 0.35488, "grad_norm": 1.96875, "grad_norm_var": 0.009163411458333333, "learning_rate": 0.0001, "loss": 3.9613, "loss/crossentropy": 2.1020091772079468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126646414399147, "step": 17744 }, { "epoch": 0.35492, "grad_norm": 2.046875, "grad_norm_var": 0.010553995768229166, "learning_rate": 0.0001, "loss": 4.0522, "loss/crossentropy": 1.8875654339790344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1913822665810585, "step": 17746 }, { "epoch": 0.35496, "grad_norm": 2.140625, "grad_norm_var": 0.011921946207682292, "learning_rate": 0.0001, "loss": 4.2408, "loss/crossentropy": 2.1043163537979126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18798817694187164, "step": 17748 }, { "epoch": 0.355, "grad_norm": 1.9921875, "grad_norm_var": 0.012373606363932291, "learning_rate": 0.0001, "loss": 4.0744, "loss/crossentropy": 2.0751022696495056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19110247492790222, "step": 17750 }, { "epoch": 0.35504, "grad_norm": 1.9921875, "grad_norm_var": 0.011815388997395834, "learning_rate": 0.0001, "loss": 4.2077, "loss/crossentropy": 2.0335286259651184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18653328716754913, "step": 17752 }, { "epoch": 0.35508, "grad_norm": 2.140625, "grad_norm_var": 0.012230428059895833, "learning_rate": 0.0001, "loss": 4.2833, "loss/crossentropy": 2.091266691684723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19435560703277588, "step": 17754 }, { "epoch": 0.35512, "grad_norm": 2.265625, "grad_norm_var": 0.015702056884765624, "learning_rate": 0.0001, "loss": 4.4468, "loss/crossentropy": 2.2889565229415894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21760451793670654, "step": 17756 }, { "epoch": 0.35516, "grad_norm": 2.140625, "grad_norm_var": 0.0162017822265625, "learning_rate": 0.0001, "loss": 4.1216, "loss/crossentropy": 1.818089485168457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18759652972221375, "step": 17758 }, { "epoch": 0.3552, "grad_norm": 1.9453125, "grad_norm_var": 0.015531158447265625, "learning_rate": 0.0001, "loss": 4.0322, "loss/crossentropy": 1.8579466938972473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20318903028964996, "step": 17760 }, { "epoch": 0.35524, "grad_norm": 2.140625, "grad_norm_var": 0.0152008056640625, "learning_rate": 0.0001, "loss": 4.058, "loss/crossentropy": 2.0463815927505493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20810134708881378, "step": 17762 }, { "epoch": 0.35528, "grad_norm": 1.859375, "grad_norm_var": 0.0146392822265625, "learning_rate": 0.0001, "loss": 3.9702, "loss/crossentropy": 1.8263658285140991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1781037524342537, "step": 17764 }, { "epoch": 0.35532, "grad_norm": 1.9453125, "grad_norm_var": 0.0123199462890625, "learning_rate": 0.0001, "loss": 4.0973, "loss/crossentropy": 2.0662325620651245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2161395400762558, "step": 17766 }, { "epoch": 0.35536, "grad_norm": 1.921875, "grad_norm_var": 0.013598378499348958, "learning_rate": 0.0001, "loss": 4.1028, "loss/crossentropy": 1.9901453256607056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20523327589035034, "step": 17768 }, { "epoch": 0.3554, "grad_norm": 2.03125, "grad_norm_var": 0.011766560872395833, "learning_rate": 0.0001, "loss": 4.1567, "loss/crossentropy": 1.9877265095710754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20279201865196228, "step": 17770 }, { "epoch": 0.35544, "grad_norm": 2.09375, "grad_norm_var": 0.013695271809895833, "learning_rate": 0.0001, "loss": 4.5072, "loss/crossentropy": 2.1376627683639526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20117300748825073, "step": 17772 }, { "epoch": 0.35548, "grad_norm": 1.953125, "grad_norm_var": 0.015327962239583333, "learning_rate": 0.0001, "loss": 3.8024, "loss/crossentropy": 2.0174089074134827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042396143078804, "step": 17774 }, { "epoch": 0.35552, "grad_norm": 1.90625, "grad_norm_var": 0.019090779622395835, "learning_rate": 0.0001, "loss": 3.7872, "loss/crossentropy": 2.1650888919830322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062518149614334, "step": 17776 }, { "epoch": 0.35556, "grad_norm": 1.9140625, "grad_norm_var": 0.017836252848307293, "learning_rate": 0.0001, "loss": 4.275, "loss/crossentropy": 2.078373670578003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.205182746052742, "step": 17778 }, { "epoch": 0.3556, "grad_norm": 1.9453125, "grad_norm_var": 0.017693837483723957, "learning_rate": 0.0001, "loss": 3.7109, "loss/crossentropy": 1.7496679425239563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17602672427892685, "step": 17780 }, { "epoch": 0.35564, "grad_norm": 1.8984375, "grad_norm_var": 0.01779963175455729, "learning_rate": 0.0001, "loss": 3.8767, "loss/crossentropy": 2.3132810592651367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21808438748121262, "step": 17782 }, { "epoch": 0.35568, "grad_norm": 2.046875, "grad_norm_var": 0.017210896809895834, "learning_rate": 0.0001, "loss": 3.8465, "loss/crossentropy": 1.9900661706924438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20264087617397308, "step": 17784 }, { "epoch": 0.35572, "grad_norm": 2.03125, "grad_norm_var": 0.01665013631184896, "learning_rate": 0.0001, "loss": 4.2735, "loss/crossentropy": 2.095883369445801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19925827533006668, "step": 17786 }, { "epoch": 0.35576, "grad_norm": 1.8671875, "grad_norm_var": 0.008771769205729167, "learning_rate": 0.0001, "loss": 3.9558, "loss/crossentropy": 1.9932443499565125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1864292174577713, "step": 17788 }, { "epoch": 0.3558, "grad_norm": 2.0625, "grad_norm_var": 0.008455149332682292, "learning_rate": 0.0001, "loss": 4.0811, "loss/crossentropy": 1.8468709588050842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18770359456539154, "step": 17790 }, { "epoch": 0.35584, "grad_norm": 1.921875, "grad_norm_var": 0.006502278645833333, "learning_rate": 0.0001, "loss": 4.1749, "loss/crossentropy": 2.293928623199463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20280499011278152, "step": 17792 }, { "epoch": 0.35588, "grad_norm": 2.0, "grad_norm_var": 0.00618896484375, "learning_rate": 0.0001, "loss": 4.0155, "loss/crossentropy": 1.8333890438079834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20070593804121017, "step": 17794 }, { "epoch": 0.35592, "grad_norm": 1.9296875, "grad_norm_var": 0.0061767578125, "learning_rate": 0.0001, "loss": 3.9903, "loss/crossentropy": 1.7173805236816406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17676317691802979, "step": 17796 }, { "epoch": 0.35596, "grad_norm": 1.953125, "grad_norm_var": 0.0058977762858072914, "learning_rate": 0.0001, "loss": 3.8196, "loss/crossentropy": 1.8198468685150146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20926716923713684, "step": 17798 }, { "epoch": 0.356, "grad_norm": 2.015625, "grad_norm_var": 0.0054840087890625, "learning_rate": 0.0001, "loss": 4.2473, "loss/crossentropy": 2.354674279689789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23675478994846344, "step": 17800 }, { "epoch": 0.35604, "grad_norm": 1.96875, "grad_norm_var": 0.005610911051432291, "learning_rate": 0.0001, "loss": 4.1092, "loss/crossentropy": 2.055357277393341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18840713798999786, "step": 17802 }, { "epoch": 0.35608, "grad_norm": 1.8359375, "grad_norm_var": 0.007089996337890625, "learning_rate": 0.0001, "loss": 4.1351, "loss/crossentropy": 2.3461071252822876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22149299830198288, "step": 17804 }, { "epoch": 0.35612, "grad_norm": 1.9453125, "grad_norm_var": 0.0083892822265625, "learning_rate": 0.0001, "loss": 4.165, "loss/crossentropy": 2.2675124406814575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21928898990154266, "step": 17806 }, { "epoch": 0.35616, "grad_norm": 1.9453125, "grad_norm_var": 0.007346343994140625, "learning_rate": 0.0001, "loss": 4.2397, "loss/crossentropy": 2.239398717880249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20473385602235794, "step": 17808 }, { "epoch": 0.3562, "grad_norm": 1.8359375, "grad_norm_var": 0.009262847900390624, "learning_rate": 0.0001, "loss": 3.8152, "loss/crossentropy": 2.022661864757538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910046860575676, "step": 17810 }, { "epoch": 0.35624, "grad_norm": 1.8984375, "grad_norm_var": 0.009236653645833334, "learning_rate": 0.0001, "loss": 4.053, "loss/crossentropy": 1.874055802822113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19950387626886368, "step": 17812 }, { "epoch": 0.35628, "grad_norm": 1.9765625, "grad_norm_var": 0.009178670247395833, "learning_rate": 0.0001, "loss": 4.058, "loss/crossentropy": 1.7601851224899292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18487969785928726, "step": 17814 }, { "epoch": 0.35632, "grad_norm": 1.984375, "grad_norm_var": 0.008348592122395833, "learning_rate": 0.0001, "loss": 4.0428, "loss/crossentropy": 1.8841391801834106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17906954884529114, "step": 17816 }, { "epoch": 0.35636, "grad_norm": 1.7890625, "grad_norm_var": 0.011092122395833333, "learning_rate": 0.0001, "loss": 3.6292, "loss/crossentropy": 1.7313326597213745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16394374519586563, "step": 17818 }, { "epoch": 0.3564, "grad_norm": 1.96875, "grad_norm_var": 0.0076901753743489586, "learning_rate": 0.0001, "loss": 3.7718, "loss/crossentropy": 2.094825506210327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20041480660438538, "step": 17820 }, { "epoch": 0.35644, "grad_norm": 1.9296875, "grad_norm_var": 0.004854329427083333, "learning_rate": 0.0001, "loss": 4.075, "loss/crossentropy": 2.222295045852661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19912637770175934, "step": 17822 }, { "epoch": 0.35648, "grad_norm": 2.015625, "grad_norm_var": 0.005980428059895833, "learning_rate": 0.0001, "loss": 3.971, "loss/crossentropy": 2.1671608090400696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20070582628250122, "step": 17824 }, { "epoch": 0.35652, "grad_norm": 2.03125, "grad_norm_var": 0.006884511311848958, "learning_rate": 0.0001, "loss": 4.0805, "loss/crossentropy": 2.164097785949707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2535782679915428, "step": 17826 }, { "epoch": 0.35656, "grad_norm": 1.8203125, "grad_norm_var": 0.008405558268229167, "learning_rate": 0.0001, "loss": 4.128, "loss/crossentropy": 1.738038182258606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17469938844442368, "step": 17828 }, { "epoch": 0.3566, "grad_norm": 1.96875, "grad_norm_var": 0.010636393229166667, "learning_rate": 0.0001, "loss": 4.3157, "loss/crossentropy": 2.3383474349975586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087705433368683, "step": 17830 }, { "epoch": 0.35664, "grad_norm": 1.7734375, "grad_norm_var": 0.012870279947916667, "learning_rate": 0.0001, "loss": 3.854, "loss/crossentropy": 2.0826202034950256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19418098032474518, "step": 17832 }, { "epoch": 0.35668, "grad_norm": 2.109375, "grad_norm_var": 0.011287434895833334, "learning_rate": 0.0001, "loss": 4.1733, "loss/crossentropy": 1.943938970565796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20667240023612976, "step": 17834 }, { "epoch": 0.35672, "grad_norm": 1.9375, "grad_norm_var": 0.011226145426432292, "learning_rate": 0.0001, "loss": 4.1312, "loss/crossentropy": 1.9756113290786743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973763257265091, "step": 17836 }, { "epoch": 0.35676, "grad_norm": 1.8515625, "grad_norm_var": 0.012898763020833334, "learning_rate": 0.0001, "loss": 3.6895, "loss/crossentropy": 1.8583598732948303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17464379966259003, "step": 17838 }, { "epoch": 0.3568, "grad_norm": 1.9609375, "grad_norm_var": 0.011791737874348958, "learning_rate": 0.0001, "loss": 3.9421, "loss/crossentropy": 1.9786911606788635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20051777362823486, "step": 17840 }, { "epoch": 0.35684, "grad_norm": 1.984375, "grad_norm_var": 0.010414377848307291, "learning_rate": 0.0001, "loss": 4.0804, "loss/crossentropy": 2.1271785497665405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21754157543182373, "step": 17842 }, { "epoch": 0.35688, "grad_norm": 1.828125, "grad_norm_var": 0.009639485677083334, "learning_rate": 0.0001, "loss": 3.9739, "loss/crossentropy": 2.264941096305847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20562931895256042, "step": 17844 }, { "epoch": 0.35692, "grad_norm": 2.078125, "grad_norm_var": 0.0083160400390625, "learning_rate": 0.0001, "loss": 4.3865, "loss/crossentropy": 2.141028881072998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23258531093597412, "step": 17846 }, { "epoch": 0.35696, "grad_norm": 2.203125, "grad_norm_var": 0.009781901041666667, "learning_rate": 0.0001, "loss": 4.1484, "loss/crossentropy": 1.9527946710586548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24948758631944656, "step": 17848 }, { "epoch": 0.357, "grad_norm": 2.03125, "grad_norm_var": 0.008854166666666666, "learning_rate": 0.0001, "loss": 4.0745, "loss/crossentropy": 1.9595746397972107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19834068417549133, "step": 17850 }, { "epoch": 0.35704, "grad_norm": 1.8984375, "grad_norm_var": 0.010396321614583334, "learning_rate": 0.0001, "loss": 4.1407, "loss/crossentropy": 1.8414466977119446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19133952260017395, "step": 17852 }, { "epoch": 0.35708, "grad_norm": 1.984375, "grad_norm_var": 0.010326894124348958, "learning_rate": 0.0001, "loss": 3.6912, "loss/crossentropy": 1.6589071154594421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17579880356788635, "step": 17854 }, { "epoch": 0.35712, "grad_norm": 1.8671875, "grad_norm_var": 0.011213175455729167, "learning_rate": 0.0001, "loss": 3.8164, "loss/crossentropy": 1.8462252020835876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19051598757505417, "step": 17856 }, { "epoch": 0.35716, "grad_norm": 2.109375, "grad_norm_var": 0.015083567301432291, "learning_rate": 0.0001, "loss": 3.9453, "loss/crossentropy": 2.0473897457122803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21250004321336746, "step": 17858 }, { "epoch": 0.3572, "grad_norm": 2.078125, "grad_norm_var": 0.014664459228515624, "learning_rate": 0.0001, "loss": 4.1836, "loss/crossentropy": 2.220176875591278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22177650034427643, "step": 17860 }, { "epoch": 0.35724, "grad_norm": 1.9140625, "grad_norm_var": 0.13489176432291666, "learning_rate": 0.0001, "loss": 3.5816, "loss/crossentropy": 1.6774207949638367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16900332272052765, "step": 17862 }, { "epoch": 0.35728, "grad_norm": 2.0, "grad_norm_var": 0.13516616821289062, "learning_rate": 0.0001, "loss": 3.8075, "loss/crossentropy": 1.9064326286315918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1925061270594597, "step": 17864 }, { "epoch": 0.35732, "grad_norm": 2.15625, "grad_norm_var": 0.1366607666015625, "learning_rate": 0.0001, "loss": 4.1698, "loss/crossentropy": 2.188231110572815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20198386162519455, "step": 17866 }, { "epoch": 0.35736, "grad_norm": 2.140625, "grad_norm_var": 0.13647359212239582, "learning_rate": 0.0001, "loss": 4.0141, "loss/crossentropy": 1.9380639791488647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19545384496450424, "step": 17868 }, { "epoch": 0.3574, "grad_norm": 2.015625, "grad_norm_var": 0.13407389322916666, "learning_rate": 0.0001, "loss": 4.1965, "loss/crossentropy": 1.995704710483551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19666064530611038, "step": 17870 }, { "epoch": 0.35744, "grad_norm": 1.96875, "grad_norm_var": 0.13017349243164061, "learning_rate": 0.0001, "loss": 4.0191, "loss/crossentropy": 2.2148354053497314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23265502601861954, "step": 17872 }, { "epoch": 0.35748, "grad_norm": 2.03125, "grad_norm_var": 0.1243072509765625, "learning_rate": 0.0001, "loss": 4.0021, "loss/crossentropy": 1.7091269493103027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21242853999137878, "step": 17874 }, { "epoch": 0.35752, "grad_norm": 2.109375, "grad_norm_var": 0.12219950358072916, "learning_rate": 0.0001, "loss": 4.3081, "loss/crossentropy": 2.2395507097244263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21505478769540787, "step": 17876 }, { "epoch": 0.35756, "grad_norm": 1.953125, "grad_norm_var": 0.008512369791666667, "learning_rate": 0.0001, "loss": 3.8636, "loss/crossentropy": 2.014355480670929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20956294238567352, "step": 17878 }, { "epoch": 0.3576, "grad_norm": 2.296875, "grad_norm_var": 0.012383778889973959, "learning_rate": 0.0001, "loss": 4.3642, "loss/crossentropy": 2.3606066703796387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22014011442661285, "step": 17880 }, { "epoch": 0.35764, "grad_norm": 1.9921875, "grad_norm_var": 0.011368560791015624, "learning_rate": 0.0001, "loss": 3.9368, "loss/crossentropy": 1.6152977347373962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1630660966038704, "step": 17882 }, { "epoch": 0.35768, "grad_norm": 1.953125, "grad_norm_var": 0.011138661702473959, "learning_rate": 0.0001, "loss": 4.156, "loss/crossentropy": 2.1977567076683044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20369882881641388, "step": 17884 }, { "epoch": 0.35772, "grad_norm": 1.8359375, "grad_norm_var": 0.012359364827473959, "learning_rate": 0.0001, "loss": 3.9927, "loss/crossentropy": 1.9582098126411438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20284517109394073, "step": 17886 }, { "epoch": 0.35776, "grad_norm": 1.8515625, "grad_norm_var": 0.013818105061848959, "learning_rate": 0.0001, "loss": 4.108, "loss/crossentropy": 2.1300426721572876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19376185536384583, "step": 17888 }, { "epoch": 0.3578, "grad_norm": 2.03125, "grad_norm_var": 0.01883519490559896, "learning_rate": 0.0001, "loss": 4.3648, "loss/crossentropy": 2.057590961456299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21069013327360153, "step": 17890 }, { "epoch": 0.35784, "grad_norm": 2.046875, "grad_norm_var": 0.01968994140625, "learning_rate": 0.0001, "loss": 4.0006, "loss/crossentropy": 2.21635901927948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19591110199689865, "step": 17892 }, { "epoch": 0.35788, "grad_norm": 2.109375, "grad_norm_var": 0.019809722900390625, "learning_rate": 0.0001, "loss": 4.4868, "loss/crossentropy": 2.6530216932296753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24127788096666336, "step": 17894 }, { "epoch": 0.35792, "grad_norm": 1.9921875, "grad_norm_var": 0.013256581624348958, "learning_rate": 0.0001, "loss": 4.0727, "loss/crossentropy": 1.9931264519691467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20592481642961502, "step": 17896 }, { "epoch": 0.35796, "grad_norm": 2.0, "grad_norm_var": 0.013305409749348959, "learning_rate": 0.0001, "loss": 4.1574, "loss/crossentropy": 2.178193688392639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2204412743449211, "step": 17898 }, { "epoch": 0.358, "grad_norm": 1.96875, "grad_norm_var": 0.012878163655598959, "learning_rate": 0.0001, "loss": 4.1748, "loss/crossentropy": 2.192083954811096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22313552349805832, "step": 17900 }, { "epoch": 0.35804, "grad_norm": 1.9375, "grad_norm_var": 0.011281077067057292, "learning_rate": 0.0001, "loss": 4.1695, "loss/crossentropy": 2.228816568851471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2120378389954567, "step": 17902 }, { "epoch": 0.35808, "grad_norm": 1.9140625, "grad_norm_var": 0.010871378580729167, "learning_rate": 0.0001, "loss": 4.1369, "loss/crossentropy": 2.0921813249588013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19552303105592728, "step": 17904 }, { "epoch": 0.35812, "grad_norm": 1.9609375, "grad_norm_var": 0.004687245686848958, "learning_rate": 0.0001, "loss": 4.2206, "loss/crossentropy": 2.0568217635154724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19668184220790863, "step": 17906 }, { "epoch": 0.35816, "grad_norm": 1.9296875, "grad_norm_var": 0.0038937886555989584, "learning_rate": 0.0001, "loss": 3.9946, "loss/crossentropy": 1.8241485357284546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15895915031433105, "step": 17908 }, { "epoch": 0.3582, "grad_norm": 1.859375, "grad_norm_var": 0.003082021077473958, "learning_rate": 0.0001, "loss": 3.9904, "loss/crossentropy": 2.2571341395378113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2065516859292984, "step": 17910 }, { "epoch": 0.35824, "grad_norm": 1.890625, "grad_norm_var": 0.0032297770182291665, "learning_rate": 0.0001, "loss": 3.9289, "loss/crossentropy": 2.0472288727760315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21586833149194717, "step": 17912 }, { "epoch": 0.35828, "grad_norm": 1.984375, "grad_norm_var": 0.0035947163899739585, "learning_rate": 0.0001, "loss": 4.0652, "loss/crossentropy": 2.0345569252967834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19726714491844177, "step": 17914 }, { "epoch": 0.35832, "grad_norm": 1.984375, "grad_norm_var": 0.0037127176920572916, "learning_rate": 0.0001, "loss": 4.0294, "loss/crossentropy": 2.15024471282959, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20401738584041595, "step": 17916 }, { "epoch": 0.35836, "grad_norm": 2.046875, "grad_norm_var": 0.004874420166015625, "learning_rate": 0.0001, "loss": 4.3122, "loss/crossentropy": 2.110986351966858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2299470156431198, "step": 17918 }, { "epoch": 0.3584, "grad_norm": 2.03125, "grad_norm_var": 0.003897857666015625, "learning_rate": 0.0001, "loss": 4.2147, "loss/crossentropy": 1.7297720909118652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18035240471363068, "step": 17920 }, { "epoch": 0.35844, "grad_norm": 2.3125, "grad_norm_var": 0.01236572265625, "learning_rate": 0.0001, "loss": 4.3997, "loss/crossentropy": 1.9483368396759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2060721442103386, "step": 17922 }, { "epoch": 0.35848, "grad_norm": 1.9453125, "grad_norm_var": 0.0123443603515625, "learning_rate": 0.0001, "loss": 4.1527, "loss/crossentropy": 1.9840999841690063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20700833201408386, "step": 17924 }, { "epoch": 0.35852, "grad_norm": 2.0625, "grad_norm_var": 0.011525217692057292, "learning_rate": 0.0001, "loss": 4.1754, "loss/crossentropy": 2.4118131399154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20695140957832336, "step": 17926 }, { "epoch": 0.35856, "grad_norm": 1.9609375, "grad_norm_var": 0.011195627848307292, "learning_rate": 0.0001, "loss": 3.9363, "loss/crossentropy": 1.9709432721138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18202026933431625, "step": 17928 }, { "epoch": 0.3586, "grad_norm": 2.078125, "grad_norm_var": 0.010206858317057291, "learning_rate": 0.0001, "loss": 4.2972, "loss/crossentropy": 1.9450251460075378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21454186737537384, "step": 17930 }, { "epoch": 0.35864, "grad_norm": 2.03125, "grad_norm_var": 0.0094390869140625, "learning_rate": 0.0001, "loss": 4.3317, "loss/crossentropy": 2.0738271474838257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21376954019069672, "step": 17932 }, { "epoch": 0.35868, "grad_norm": 2.03125, "grad_norm_var": 0.010109202067057291, "learning_rate": 0.0001, "loss": 4.1613, "loss/crossentropy": 2.1857110261917114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2238282486796379, "step": 17934 }, { "epoch": 0.35872, "grad_norm": 1.96875, "grad_norm_var": 0.010643513997395833, "learning_rate": 0.0001, "loss": 4.2208, "loss/crossentropy": 2.2790093421936035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987098902463913, "step": 17936 }, { "epoch": 0.35876, "grad_norm": 1.8671875, "grad_norm_var": 0.0054280598958333336, "learning_rate": 0.0001, "loss": 3.9261, "loss/crossentropy": 2.1101399064064026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034989446401596, "step": 17938 }, { "epoch": 0.3588, "grad_norm": 1.8203125, "grad_norm_var": 0.005793253580729167, "learning_rate": 0.0001, "loss": 3.7024, "loss/crossentropy": 1.6923771500587463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18680892139673233, "step": 17940 }, { "epoch": 0.35884, "grad_norm": 1.8515625, "grad_norm_var": 0.005342356363932292, "learning_rate": 0.0001, "loss": 3.9282, "loss/crossentropy": 1.6772454977035522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17377988994121552, "step": 17942 }, { "epoch": 0.35888, "grad_norm": 1.921875, "grad_norm_var": 0.007722981770833333, "learning_rate": 0.0001, "loss": 4.141, "loss/crossentropy": 2.2448233366012573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21804184466600418, "step": 17944 }, { "epoch": 0.35892, "grad_norm": 1.953125, "grad_norm_var": 0.006688435872395833, "learning_rate": 0.0001, "loss": 3.9984, "loss/crossentropy": 1.934161365032196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1834414005279541, "step": 17946 }, { "epoch": 0.35896, "grad_norm": 2.0, "grad_norm_var": 0.007344563802083333, "learning_rate": 0.0001, "loss": 3.9943, "loss/crossentropy": 2.06991970539093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21084094047546387, "step": 17948 }, { "epoch": 0.359, "grad_norm": 1.984375, "grad_norm_var": 0.007916005452473958, "learning_rate": 0.0001, "loss": 4.0493, "loss/crossentropy": 2.0942054986953735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21652275323867798, "step": 17950 }, { "epoch": 0.35904, "grad_norm": 2.078125, "grad_norm_var": 0.009134928385416666, "learning_rate": 0.0001, "loss": 4.1125, "loss/crossentropy": 2.0005252361297607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198953777551651, "step": 17952 }, { "epoch": 0.35908, "grad_norm": 2.609375, "grad_norm_var": 0.0327301025390625, "learning_rate": 0.0001, "loss": 4.1639, "loss/crossentropy": 2.1563133597373962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072513848543167, "step": 17954 }, { "epoch": 0.35912, "grad_norm": 2.109375, "grad_norm_var": 0.0312896728515625, "learning_rate": 0.0001, "loss": 4.1789, "loss/crossentropy": 2.3533977270126343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21483703702688217, "step": 17956 }, { "epoch": 0.35916, "grad_norm": 1.9375, "grad_norm_var": 0.029581705729166668, "learning_rate": 0.0001, "loss": 4.0921, "loss/crossentropy": 2.1915602684020996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2241670861840248, "step": 17958 }, { "epoch": 0.3592, "grad_norm": 1.9765625, "grad_norm_var": 0.0314117431640625, "learning_rate": 0.0001, "loss": 3.9225, "loss/crossentropy": 1.9529941082000732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1920214593410492, "step": 17960 }, { "epoch": 0.35924, "grad_norm": 2.015625, "grad_norm_var": 0.030350748697916666, "learning_rate": 0.0001, "loss": 4.4467, "loss/crossentropy": 2.308731436729431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2238362655043602, "step": 17962 }, { "epoch": 0.35928, "grad_norm": 2.03125, "grad_norm_var": 0.04334208170572917, "learning_rate": 0.0001, "loss": 4.1682, "loss/crossentropy": 2.0917986631393433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061680108308792, "step": 17964 }, { "epoch": 0.35932, "grad_norm": 2.265625, "grad_norm_var": 10.265104166666667, "learning_rate": 0.0001, "loss": 4.9501, "loss/crossentropy": 2.16925585269928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20239810645580292, "step": 17966 }, { "epoch": 0.35936, "grad_norm": 2.0, "grad_norm_var": 10.219252268473307, "learning_rate": 0.0001, "loss": 4.287, "loss/crossentropy": 2.1585946083068848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21381714940071106, "step": 17968 }, { "epoch": 0.3594, "grad_norm": 1.8359375, "grad_norm_var": 10.281192016601562, "learning_rate": 0.0001, "loss": 4.0133, "loss/crossentropy": 1.8752552270889282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19667387753725052, "step": 17970 }, { "epoch": 0.35944, "grad_norm": 1.9140625, "grad_norm_var": 10.291275024414062, "learning_rate": 0.0001, "loss": 3.9797, "loss/crossentropy": 1.8533543944358826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1844579428434372, "step": 17972 }, { "epoch": 0.35948, "grad_norm": 2.234375, "grad_norm_var": 10.282754516601562, "learning_rate": 0.0001, "loss": 4.5316, "loss/crossentropy": 2.5859906673431396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23142188042402267, "step": 17974 }, { "epoch": 0.35952, "grad_norm": 2.15625, "grad_norm_var": 10.236201985677083, "learning_rate": 0.0001, "loss": 4.2607, "loss/crossentropy": 2.2086023092269897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21764708310365677, "step": 17976 }, { "epoch": 0.35956, "grad_norm": 1.953125, "grad_norm_var": 10.258858235677083, "learning_rate": 0.0001, "loss": 4.2578, "loss/crossentropy": 2.3860682249069214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22804389894008636, "step": 17978 }, { "epoch": 0.3596, "grad_norm": 2.0, "grad_norm_var": 10.314815266927083, "learning_rate": 0.0001, "loss": 4.2055, "loss/crossentropy": 2.1412216424942017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200544573366642, "step": 17980 }, { "epoch": 0.35964, "grad_norm": 1.9140625, "grad_norm_var": 0.022395833333333334, "learning_rate": 0.0001, "loss": 4.0412, "loss/crossentropy": 2.198704957962036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043847218155861, "step": 17982 }, { "epoch": 0.35968, "grad_norm": 1.90625, "grad_norm_var": 0.010249837239583334, "learning_rate": 0.0001, "loss": 4.037, "loss/crossentropy": 2.0053776502609253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072744369506836, "step": 17984 }, { "epoch": 0.35972, "grad_norm": 2.0625, "grad_norm_var": 0.04221979777018229, "learning_rate": 0.0001, "loss": 4.127, "loss/crossentropy": 2.5015710592269897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23083895444869995, "step": 17986 }, { "epoch": 0.35976, "grad_norm": 2.015625, "grad_norm_var": 0.04145889282226563, "learning_rate": 0.0001, "loss": 4.0531, "loss/crossentropy": 2.037220776081085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19487662613391876, "step": 17988 }, { "epoch": 0.3598, "grad_norm": 1.921875, "grad_norm_var": 0.03911107381184896, "learning_rate": 0.0001, "loss": 3.9801, "loss/crossentropy": 1.9316660165786743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183449983596802, "step": 17990 }, { "epoch": 0.35984, "grad_norm": 1.828125, "grad_norm_var": 0.04090067545572917, "learning_rate": 0.0001, "loss": 3.804, "loss/crossentropy": 1.892772138118744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17082024365663528, "step": 17992 }, { "epoch": 0.35988, "grad_norm": 1.9453125, "grad_norm_var": 0.04088109334309896, "learning_rate": 0.0001, "loss": 4.0732, "loss/crossentropy": 1.9325169324874878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900368332862854, "step": 17994 }, { "epoch": 0.35992, "grad_norm": 2.421875, "grad_norm_var": 0.050388336181640625, "learning_rate": 0.0001, "loss": 4.1392, "loss/crossentropy": 1.9746126532554626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21207977831363678, "step": 17996 }, { "epoch": 0.35996, "grad_norm": 2.03125, "grad_norm_var": 0.048620351155598956, "learning_rate": 0.0001, "loss": 4.0308, "loss/crossentropy": 2.1249493956565857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18674498051404953, "step": 17998 }, { "epoch": 0.36, "grad_norm": 2.046875, "grad_norm_var": 0.04625422159830729, "learning_rate": 0.0001, "loss": 4.0854, "loss/crossentropy": 2.1279499530792236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214716374874115, "step": 18000 }, { "epoch": 0.36004, "grad_norm": 1.9765625, "grad_norm_var": 0.0177398681640625, "learning_rate": 0.0001, "loss": 3.9238, "loss/crossentropy": 2.301763415336609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20897136628627777, "step": 18002 }, { "epoch": 0.36008, "grad_norm": 1.9921875, "grad_norm_var": 0.01788330078125, "learning_rate": 0.0001, "loss": 4.0944, "loss/crossentropy": 1.8079062104225159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19506293535232544, "step": 18004 }, { "epoch": 0.36012, "grad_norm": 1.890625, "grad_norm_var": 0.0205078125, "learning_rate": 0.0001, "loss": 4.0503, "loss/crossentropy": 1.9429230093955994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19762147217988968, "step": 18006 }, { "epoch": 0.36016, "grad_norm": 1.9296875, "grad_norm_var": 0.019608561197916666, "learning_rate": 0.0001, "loss": 3.9801, "loss/crossentropy": 2.307368576526642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21261631697416306, "step": 18008 }, { "epoch": 0.3602, "grad_norm": 1.8203125, "grad_norm_var": 0.021952311197916668, "learning_rate": 0.0001, "loss": 3.8723, "loss/crossentropy": 2.2834020853042603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20086131989955902, "step": 18010 }, { "epoch": 0.36024, "grad_norm": 1.875, "grad_norm_var": 0.0117340087890625, "learning_rate": 0.0001, "loss": 4.1708, "loss/crossentropy": 2.1730109453201294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20470792055130005, "step": 18012 }, { "epoch": 0.36028, "grad_norm": 2.109375, "grad_norm_var": 0.012809244791666667, "learning_rate": 0.0001, "loss": 4.4047, "loss/crossentropy": 2.491591691970825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2361597716808319, "step": 18014 }, { "epoch": 0.36032, "grad_norm": 1.9296875, "grad_norm_var": 0.013240305582682292, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 2.024519979953766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042919620871544, "step": 18016 }, { "epoch": 0.36036, "grad_norm": 1.7734375, "grad_norm_var": 0.016123199462890626, "learning_rate": 0.0001, "loss": 3.9521, "loss/crossentropy": 2.055600941181183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17253537476062775, "step": 18018 }, { "epoch": 0.3604, "grad_norm": 2.046875, "grad_norm_var": 0.0188629150390625, "learning_rate": 0.0001, "loss": 3.9665, "loss/crossentropy": 1.8796368837356567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21168731898069382, "step": 18020 }, { "epoch": 0.36044, "grad_norm": 1.9140625, "grad_norm_var": 0.014825185139973959, "learning_rate": 0.0001, "loss": 4.0291, "loss/crossentropy": 1.9885223507881165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20676057040691376, "step": 18022 }, { "epoch": 0.36048, "grad_norm": 1.90625, "grad_norm_var": 0.010839589436848958, "learning_rate": 0.0001, "loss": 3.8401, "loss/crossentropy": 2.0412577986717224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18211868405342102, "step": 18024 }, { "epoch": 0.36052, "grad_norm": 1.828125, "grad_norm_var": 0.010445149739583333, "learning_rate": 0.0001, "loss": 3.8696, "loss/crossentropy": 2.1391053199768066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20121745765209198, "step": 18026 }, { "epoch": 0.36056, "grad_norm": 2.046875, "grad_norm_var": 0.010087076822916667, "learning_rate": 0.0001, "loss": 3.9683, "loss/crossentropy": 1.8888981938362122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19372491538524628, "step": 18028 }, { "epoch": 0.3606, "grad_norm": 2.015625, "grad_norm_var": 0.007087198893229166, "learning_rate": 0.0001, "loss": 4.2467, "loss/crossentropy": 2.002028524875641, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18978165835142136, "step": 18030 }, { "epoch": 0.36064, "grad_norm": 1.828125, "grad_norm_var": 0.007828776041666667, "learning_rate": 0.0001, "loss": 4.0703, "loss/crossentropy": 2.3448036909103394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21162079274654388, "step": 18032 }, { "epoch": 0.36068, "grad_norm": 2.5625, "grad_norm_var": 0.03154474894205729, "learning_rate": 0.0001, "loss": 4.283, "loss/crossentropy": 2.086554765701294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25456031411886215, "step": 18034 }, { "epoch": 0.36072, "grad_norm": 1.8671875, "grad_norm_var": 0.02948582967122396, "learning_rate": 0.0001, "loss": 3.8122, "loss/crossentropy": 1.9166680574417114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18382181227207184, "step": 18036 }, { "epoch": 0.36076, "grad_norm": 1.8984375, "grad_norm_var": 0.029642740885416668, "learning_rate": 0.0001, "loss": 3.9937, "loss/crossentropy": 2.050579786300659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20126450806856155, "step": 18038 }, { "epoch": 0.3608, "grad_norm": 1.9453125, "grad_norm_var": 0.030049641927083332, "learning_rate": 0.0001, "loss": 3.9273, "loss/crossentropy": 2.2339882850646973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20789660513401031, "step": 18040 }, { "epoch": 0.36084, "grad_norm": 1.8984375, "grad_norm_var": 0.029243977864583333, "learning_rate": 0.0001, "loss": 4.0086, "loss/crossentropy": 2.248544931411743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20925041288137436, "step": 18042 }, { "epoch": 0.36088, "grad_norm": 2.046875, "grad_norm_var": 0.028955078125, "learning_rate": 0.0001, "loss": 4.0448, "loss/crossentropy": 1.8894963264465332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18232882767915726, "step": 18044 }, { "epoch": 0.36092, "grad_norm": 2.046875, "grad_norm_var": 0.0289703369140625, "learning_rate": 0.0001, "loss": 4.1838, "loss/crossentropy": 2.0220844745635986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18812591582536697, "step": 18046 }, { "epoch": 0.36096, "grad_norm": 1.9921875, "grad_norm_var": 0.027913411458333332, "learning_rate": 0.0001, "loss": 4.2088, "loss/crossentropy": 2.1896166801452637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.205679252743721, "step": 18048 }, { "epoch": 0.361, "grad_norm": 2.046875, "grad_norm_var": 0.004686482747395833, "learning_rate": 0.0001, "loss": 4.0511, "loss/crossentropy": 2.0670089721679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18881987780332565, "step": 18050 }, { "epoch": 0.36104, "grad_norm": 2.046875, "grad_norm_var": 0.005500284830729166, "learning_rate": 0.0001, "loss": 4.2426, "loss/crossentropy": 2.155470609664917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147108018398285, "step": 18052 }, { "epoch": 0.36108, "grad_norm": 2.15625, "grad_norm_var": 0.0076812744140625, "learning_rate": 0.0001, "loss": 4.2007, "loss/crossentropy": 1.8182223439216614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18691811710596085, "step": 18054 }, { "epoch": 0.36112, "grad_norm": 1.9453125, "grad_norm_var": 0.0066314697265625, "learning_rate": 0.0001, "loss": 3.8601, "loss/crossentropy": 1.7366089820861816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18361609429121017, "step": 18056 }, { "epoch": 0.36116, "grad_norm": 1.96875, "grad_norm_var": 0.005704498291015625, "learning_rate": 0.0001, "loss": 4.027, "loss/crossentropy": 1.9429153203964233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974082887172699, "step": 18058 }, { "epoch": 0.3612, "grad_norm": 1.9453125, "grad_norm_var": 0.00562744140625, "learning_rate": 0.0001, "loss": 4.2751, "loss/crossentropy": 2.230627417564392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22216492146253586, "step": 18060 }, { "epoch": 0.36124, "grad_norm": 1.9140625, "grad_norm_var": 0.006371053059895834, "learning_rate": 0.0001, "loss": 4.1759, "loss/crossentropy": 2.0506786704063416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20492341369390488, "step": 18062 }, { "epoch": 0.36128, "grad_norm": 1.8984375, "grad_norm_var": 0.007116444905598958, "learning_rate": 0.0001, "loss": 3.778, "loss/crossentropy": 2.025477647781372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20034795254468918, "step": 18064 }, { "epoch": 0.36132, "grad_norm": 2.046875, "grad_norm_var": 0.006617991129557291, "learning_rate": 0.0001, "loss": 4.2998, "loss/crossentropy": 1.9828922748565674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19445167481899261, "step": 18066 }, { "epoch": 0.36136, "grad_norm": 2.046875, "grad_norm_var": 0.006485748291015625, "learning_rate": 0.0001, "loss": 4.2566, "loss/crossentropy": 2.2869513630867004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21809721738100052, "step": 18068 }, { "epoch": 0.3614, "grad_norm": 1.875, "grad_norm_var": 0.004129791259765625, "learning_rate": 0.0001, "loss": 3.9636, "loss/crossentropy": 2.1584482192993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21123671531677246, "step": 18070 }, { "epoch": 0.36144, "grad_norm": 1.9296875, "grad_norm_var": 0.0046078999837239586, "learning_rate": 0.0001, "loss": 3.9628, "loss/crossentropy": 2.112669587135315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20795845240354538, "step": 18072 }, { "epoch": 0.36148, "grad_norm": 2.09375, "grad_norm_var": 0.0059234619140625, "learning_rate": 0.0001, "loss": 4.2516, "loss/crossentropy": 2.3283581733703613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22724319994449615, "step": 18074 }, { "epoch": 0.36152, "grad_norm": 1.8671875, "grad_norm_var": 0.006078084309895833, "learning_rate": 0.0001, "loss": 3.8966, "loss/crossentropy": 2.286331057548523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20937074720859528, "step": 18076 }, { "epoch": 0.36156, "grad_norm": 1.7578125, "grad_norm_var": 0.009110260009765624, "learning_rate": 0.0001, "loss": 4.1124, "loss/crossentropy": 2.1055954694747925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1924988478422165, "step": 18078 }, { "epoch": 0.3616, "grad_norm": 1.9921875, "grad_norm_var": 0.008186594645182291, "learning_rate": 0.0001, "loss": 4.0349, "loss/crossentropy": 2.180675983428955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21886169910430908, "step": 18080 }, { "epoch": 0.36164, "grad_norm": 2.078125, "grad_norm_var": 0.008621978759765624, "learning_rate": 0.0001, "loss": 3.9958, "loss/crossentropy": 1.7595775127410889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17430071532726288, "step": 18082 }, { "epoch": 0.36168, "grad_norm": 1.921875, "grad_norm_var": 0.011472320556640625, "learning_rate": 0.0001, "loss": 4.0306, "loss/crossentropy": 2.1191208958625793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19574576616287231, "step": 18084 }, { "epoch": 0.36172, "grad_norm": 1.9921875, "grad_norm_var": 0.0110595703125, "learning_rate": 0.0001, "loss": 4.1563, "loss/crossentropy": 2.0964609384536743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19742169976234436, "step": 18086 }, { "epoch": 0.36176, "grad_norm": 1.9453125, "grad_norm_var": 0.0108154296875, "learning_rate": 0.0001, "loss": 4.3584, "loss/crossentropy": 2.3081077337265015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2177809551358223, "step": 18088 }, { "epoch": 0.3618, "grad_norm": 1.9453125, "grad_norm_var": 0.009928385416666666, "learning_rate": 0.0001, "loss": 4.0493, "loss/crossentropy": 2.340154528617859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23811470717191696, "step": 18090 }, { "epoch": 0.36184, "grad_norm": 1.9140625, "grad_norm_var": 0.009777577718098958, "learning_rate": 0.0001, "loss": 3.828, "loss/crossentropy": 1.9872968196868896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19391798973083496, "step": 18092 }, { "epoch": 0.36188, "grad_norm": 1.984375, "grad_norm_var": 0.0067779541015625, "learning_rate": 0.0001, "loss": 3.9404, "loss/crossentropy": 1.9162002205848694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.203052818775177, "step": 18094 }, { "epoch": 0.36192, "grad_norm": 1.7890625, "grad_norm_var": 0.009718577067057291, "learning_rate": 0.0001, "loss": 3.7458, "loss/crossentropy": 2.240887403488159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19722212105989456, "step": 18096 }, { "epoch": 0.36196, "grad_norm": 2.078125, "grad_norm_var": 0.009496053059895834, "learning_rate": 0.0001, "loss": 4.1473, "loss/crossentropy": 1.8601738214492798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938677802681923, "step": 18098 }, { "epoch": 0.362, "grad_norm": 1.84375, "grad_norm_var": 0.005729166666666666, "learning_rate": 0.0001, "loss": 4.0131, "loss/crossentropy": 1.9588143229484558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19559185206890106, "step": 18100 }, { "epoch": 0.36204, "grad_norm": 1.8359375, "grad_norm_var": 0.0060618082682291664, "learning_rate": 0.0001, "loss": 3.9199, "loss/crossentropy": 1.909518837928772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974424198269844, "step": 18102 }, { "epoch": 0.36208, "grad_norm": 1.8203125, "grad_norm_var": 0.005641428629557291, "learning_rate": 0.0001, "loss": 4.15, "loss/crossentropy": 2.0762908458709717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981927454471588, "step": 18104 }, { "epoch": 0.36212, "grad_norm": 2.015625, "grad_norm_var": 0.006339263916015625, "learning_rate": 0.0001, "loss": 4.3226, "loss/crossentropy": 2.4555106163024902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001618146896362, "step": 18106 }, { "epoch": 0.36216, "grad_norm": 1.7734375, "grad_norm_var": 0.013533528645833333, "learning_rate": 0.0001, "loss": 4.0595, "loss/crossentropy": 1.9983150959014893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19945750385522842, "step": 18108 }, { "epoch": 0.3622, "grad_norm": 1.90625, "grad_norm_var": 0.013181304931640625, "learning_rate": 0.0001, "loss": 4.0863, "loss/crossentropy": 2.152313530445099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21388111263513565, "step": 18110 }, { "epoch": 0.36224, "grad_norm": 2.125, "grad_norm_var": 0.016886393229166668, "learning_rate": 0.0001, "loss": 4.1128, "loss/crossentropy": 1.690669596195221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1883523240685463, "step": 18112 }, { "epoch": 0.36228, "grad_norm": 2.09375, "grad_norm_var": 0.017179107666015624, "learning_rate": 0.0001, "loss": 3.9978, "loss/crossentropy": 1.9404500126838684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18945956230163574, "step": 18114 }, { "epoch": 0.36232, "grad_norm": 1.9765625, "grad_norm_var": 0.016007486979166666, "learning_rate": 0.0001, "loss": 4.4396, "loss/crossentropy": 2.213471293449402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22550443559885025, "step": 18116 }, { "epoch": 0.36236, "grad_norm": 2.015625, "grad_norm_var": 0.014233144124348958, "learning_rate": 0.0001, "loss": 4.3084, "loss/crossentropy": 2.1008135080337524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002386674284935, "step": 18118 }, { "epoch": 0.3624, "grad_norm": 1.8359375, "grad_norm_var": 0.013866933186848958, "learning_rate": 0.0001, "loss": 3.8664, "loss/crossentropy": 1.9704426527023315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1806606948375702, "step": 18120 }, { "epoch": 0.36244, "grad_norm": 1.890625, "grad_norm_var": 0.01473388671875, "learning_rate": 0.0001, "loss": 3.7456, "loss/crossentropy": 1.8658949732780457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19861871004104614, "step": 18122 }, { "epoch": 0.36248, "grad_norm": 2.0625, "grad_norm_var": 0.011034901936848958, "learning_rate": 0.0001, "loss": 4.0341, "loss/crossentropy": 1.9327979683876038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19395866990089417, "step": 18124 }, { "epoch": 0.36252, "grad_norm": 1.984375, "grad_norm_var": 0.009641265869140625, "learning_rate": 0.0001, "loss": 4.071, "loss/crossentropy": 2.377102255821228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21928569674491882, "step": 18126 }, { "epoch": 0.36256, "grad_norm": 2.0625, "grad_norm_var": 0.006994374593098958, "learning_rate": 0.0001, "loss": 3.9623, "loss/crossentropy": 2.1650161743164062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2292800396680832, "step": 18128 }, { "epoch": 0.3626, "grad_norm": 2.078125, "grad_norm_var": 0.007458241780598959, "learning_rate": 0.0001, "loss": 4.3135, "loss/crossentropy": 2.281827986240387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190621793270111, "step": 18130 }, { "epoch": 0.36264, "grad_norm": 1.9140625, "grad_norm_var": 0.00848388671875, "learning_rate": 0.0001, "loss": 3.8335, "loss/crossentropy": 2.0073219537734985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18984179198741913, "step": 18132 }, { "epoch": 0.36268, "grad_norm": 2.0625, "grad_norm_var": 0.01141357421875, "learning_rate": 0.0001, "loss": 3.7551, "loss/crossentropy": 1.7675580978393555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18771683424711227, "step": 18134 }, { "epoch": 0.36272, "grad_norm": 1.8671875, "grad_norm_var": 0.011154937744140624, "learning_rate": 0.0001, "loss": 3.8816, "loss/crossentropy": 2.064103126525879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20835159718990326, "step": 18136 }, { "epoch": 0.36276, "grad_norm": 2.140625, "grad_norm_var": 0.0126861572265625, "learning_rate": 0.0001, "loss": 4.2122, "loss/crossentropy": 2.1821314096450806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2391536980867386, "step": 18138 }, { "epoch": 0.3628, "grad_norm": 1.8984375, "grad_norm_var": 0.010117340087890624, "learning_rate": 0.0001, "loss": 4.2307, "loss/crossentropy": 2.295500636100769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215906023979187, "step": 18140 }, { "epoch": 0.36284, "grad_norm": 1.9609375, "grad_norm_var": 0.010245768229166667, "learning_rate": 0.0001, "loss": 4.1144, "loss/crossentropy": 2.0736570954322815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18491360545158386, "step": 18142 }, { "epoch": 0.36288, "grad_norm": 1.9296875, "grad_norm_var": 0.009496053059895834, "learning_rate": 0.0001, "loss": 4.1326, "loss/crossentropy": 2.0513535737991333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20204297453165054, "step": 18144 }, { "epoch": 0.36292, "grad_norm": 2.140625, "grad_norm_var": 0.010601552327473958, "learning_rate": 0.0001, "loss": 4.1648, "loss/crossentropy": 2.1471269130706787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204594686627388, "step": 18146 }, { "epoch": 0.36296, "grad_norm": 2.125, "grad_norm_var": 0.011372884114583334, "learning_rate": 0.0001, "loss": 4.2728, "loss/crossentropy": 2.093555986881256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20173487067222595, "step": 18148 }, { "epoch": 0.363, "grad_norm": 1.6953125, "grad_norm_var": 0.013627115885416667, "learning_rate": 0.0001, "loss": 3.7183, "loss/crossentropy": 1.6938685178756714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1703190803527832, "step": 18150 }, { "epoch": 0.36304, "grad_norm": 2.046875, "grad_norm_var": 0.013337961832682292, "learning_rate": 0.0001, "loss": 4.116, "loss/crossentropy": 2.180016875267029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23546195775270462, "step": 18152 }, { "epoch": 0.36308, "grad_norm": 2.015625, "grad_norm_var": 0.011277008056640624, "learning_rate": 0.0001, "loss": 4.1826, "loss/crossentropy": 2.145975947380066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20554804801940918, "step": 18154 }, { "epoch": 0.36312, "grad_norm": 1.953125, "grad_norm_var": 0.011131795247395833, "learning_rate": 0.0001, "loss": 4.0113, "loss/crossentropy": 2.068985939025879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20410801470279694, "step": 18156 }, { "epoch": 0.36316, "grad_norm": 1.8671875, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0001, "loss": 3.8273, "loss/crossentropy": 1.8428975343704224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19165785610675812, "step": 18158 }, { "epoch": 0.3632, "grad_norm": 1.875, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 3.7808, "loss/crossentropy": 1.859747588634491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18524104356765747, "step": 18160 }, { "epoch": 0.36324, "grad_norm": 2.0, "grad_norm_var": 0.011521148681640624, "learning_rate": 0.0001, "loss": 4.1458, "loss/crossentropy": 2.1383343935012817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21649780869483948, "step": 18162 }, { "epoch": 0.36328, "grad_norm": 2.078125, "grad_norm_var": 0.010758209228515624, "learning_rate": 0.0001, "loss": 3.9889, "loss/crossentropy": 2.0153123140335083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2131347879767418, "step": 18164 }, { "epoch": 0.36332, "grad_norm": 1.875, "grad_norm_var": 0.007004547119140625, "learning_rate": 0.0001, "loss": 4.0827, "loss/crossentropy": 1.8382813930511475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18116765469312668, "step": 18166 }, { "epoch": 0.36336, "grad_norm": 1.9609375, "grad_norm_var": 0.007287343343098958, "learning_rate": 0.0001, "loss": 3.8883, "loss/crossentropy": 1.9639039039611816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2014933079481125, "step": 18168 }, { "epoch": 0.3634, "grad_norm": 2.21875, "grad_norm_var": 0.012446848551432292, "learning_rate": 0.0001, "loss": 4.2886, "loss/crossentropy": 2.057366132736206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134721800684929, "step": 18170 }, { "epoch": 0.36344, "grad_norm": 1.953125, "grad_norm_var": 0.012690989176432292, "learning_rate": 0.0001, "loss": 4.0801, "loss/crossentropy": 2.244979500770569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219241164624691, "step": 18172 }, { "epoch": 0.36348, "grad_norm": 1.921875, "grad_norm_var": 0.010625966389973958, "learning_rate": 0.0001, "loss": 4.1271, "loss/crossentropy": 2.1159361600875854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19503474980592728, "step": 18174 }, { "epoch": 0.36352, "grad_norm": 1.9765625, "grad_norm_var": 0.0099761962890625, "learning_rate": 0.0001, "loss": 4.2031, "loss/crossentropy": 2.240646004676819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22175125032663345, "step": 18176 }, { "epoch": 0.36356, "grad_norm": 1.8984375, "grad_norm_var": 0.011327107747395834, "learning_rate": 0.0001, "loss": 3.9508, "loss/crossentropy": 1.7946885228157043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17505639791488647, "step": 18178 }, { "epoch": 0.3636, "grad_norm": 1.9296875, "grad_norm_var": 0.010420735677083333, "learning_rate": 0.0001, "loss": 4.1138, "loss/crossentropy": 2.2058286666870117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21548640727996826, "step": 18180 }, { "epoch": 0.36364, "grad_norm": 1.9921875, "grad_norm_var": 0.0101226806640625, "learning_rate": 0.0001, "loss": 4.0831, "loss/crossentropy": 2.168944835662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22227579355239868, "step": 18182 }, { "epoch": 0.36368, "grad_norm": 1.96875, "grad_norm_var": 0.008392079671223959, "learning_rate": 0.0001, "loss": 4.1482, "loss/crossentropy": 2.019882082939148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20161078870296478, "step": 18184 }, { "epoch": 0.36372, "grad_norm": 1.9375, "grad_norm_var": 0.0034993489583333335, "learning_rate": 0.0001, "loss": 4.1305, "loss/crossentropy": 2.000952959060669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19938969612121582, "step": 18186 }, { "epoch": 0.36376, "grad_norm": 1.9921875, "grad_norm_var": 0.0024920145670572916, "learning_rate": 0.0001, "loss": 3.9158, "loss/crossentropy": 2.0612798929214478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022939696907997, "step": 18188 }, { "epoch": 0.3638, "grad_norm": 1.78125, "grad_norm_var": 0.0042307535807291664, "learning_rate": 0.0001, "loss": 3.8822, "loss/crossentropy": 1.664880096912384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16950316727161407, "step": 18190 }, { "epoch": 0.36384, "grad_norm": 1.875, "grad_norm_var": 0.003360748291015625, "learning_rate": 0.0001, "loss": 3.9881, "loss/crossentropy": 1.8047854900360107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1823924407362938, "step": 18192 }, { "epoch": 0.36388, "grad_norm": 1.8359375, "grad_norm_var": 0.0060808817545572914, "learning_rate": 0.0001, "loss": 3.7857, "loss/crossentropy": 2.167177438735962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963154897093773, "step": 18194 }, { "epoch": 0.36392, "grad_norm": 1.875, "grad_norm_var": 0.006151326497395833, "learning_rate": 0.0001, "loss": 4.0372, "loss/crossentropy": 2.396019458770752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20359576493501663, "step": 18196 }, { "epoch": 0.36396, "grad_norm": 1.9375, "grad_norm_var": 0.005866495768229166, "learning_rate": 0.0001, "loss": 3.9349, "loss/crossentropy": 2.0866541862487793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2082398235797882, "step": 18198 }, { "epoch": 0.364, "grad_norm": 2.015625, "grad_norm_var": 0.0060791015625, "learning_rate": 0.0001, "loss": 4.1046, "loss/crossentropy": 2.170537829399109, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19030719250440598, "step": 18200 }, { "epoch": 0.36404, "grad_norm": 2.328125, "grad_norm_var": 0.024234771728515625, "learning_rate": 0.0001, "loss": 4.3921, "loss/crossentropy": 1.938852846622467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19586428999900818, "step": 18202 }, { "epoch": 0.36408, "grad_norm": 1.9453125, "grad_norm_var": 0.023996734619140626, "learning_rate": 0.0001, "loss": 4.0629, "loss/crossentropy": 1.8909979462623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17625273764133453, "step": 18204 }, { "epoch": 0.36412, "grad_norm": 1.9296875, "grad_norm_var": 0.022078450520833334, "learning_rate": 0.0001, "loss": 4.1729, "loss/crossentropy": 1.9127016067504883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1888885200023651, "step": 18206 }, { "epoch": 0.36416, "grad_norm": 1.9296875, "grad_norm_var": 0.03103612263997396, "learning_rate": 0.0001, "loss": 4.1826, "loss/crossentropy": 2.182482957839966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21610775589942932, "step": 18208 }, { "epoch": 0.3642, "grad_norm": 1.8828125, "grad_norm_var": 0.027497355143229166, "learning_rate": 0.0001, "loss": 4.1941, "loss/crossentropy": 2.0900736451148987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20886409282684326, "step": 18210 }, { "epoch": 0.36424, "grad_norm": 2.0, "grad_norm_var": 0.0248199462890625, "learning_rate": 0.0001, "loss": 3.9184, "loss/crossentropy": 2.2132604122161865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21213480830192566, "step": 18212 }, { "epoch": 0.36428, "grad_norm": 2.171875, "grad_norm_var": 0.024825032552083334, "learning_rate": 0.0001, "loss": 4.0002, "loss/crossentropy": 1.7810762524604797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188262477517128, "step": 18214 }, { "epoch": 0.36432, "grad_norm": 1.984375, "grad_norm_var": 0.02490208943684896, "learning_rate": 0.0001, "loss": 3.8339, "loss/crossentropy": 2.072141647338867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20580865442752838, "step": 18216 }, { "epoch": 0.36436, "grad_norm": 2.125, "grad_norm_var": 0.016778310139973957, "learning_rate": 0.0001, "loss": 4.0777, "loss/crossentropy": 1.9672082662582397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19521142542362213, "step": 18218 }, { "epoch": 0.3644, "grad_norm": 1.890625, "grad_norm_var": 0.017574055989583334, "learning_rate": 0.0001, "loss": 4.0763, "loss/crossentropy": 2.0122682452201843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21602813154459, "step": 18220 }, { "epoch": 0.36444, "grad_norm": 2.078125, "grad_norm_var": 0.0169342041015625, "learning_rate": 0.0001, "loss": 4.0574, "loss/crossentropy": 2.1709738969802856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2267712950706482, "step": 18222 }, { "epoch": 0.36448, "grad_norm": 2.03125, "grad_norm_var": 0.009091949462890625, "learning_rate": 0.0001, "loss": 4.2092, "loss/crossentropy": 2.246786117553711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128008008003235, "step": 18224 }, { "epoch": 0.36452, "grad_norm": 1.9296875, "grad_norm_var": 0.0063250223795572914, "learning_rate": 0.0001, "loss": 4.1051, "loss/crossentropy": 1.686498999595642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18233107775449753, "step": 18226 }, { "epoch": 0.36456, "grad_norm": 2.078125, "grad_norm_var": 0.006786855061848959, "learning_rate": 0.0001, "loss": 4.3823, "loss/crossentropy": 2.5849136114120483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23062889277935028, "step": 18228 }, { "epoch": 0.3646, "grad_norm": 1.921875, "grad_norm_var": 0.005804189046223958, "learning_rate": 0.0001, "loss": 4.04, "loss/crossentropy": 2.068212151527405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20198806375265121, "step": 18230 }, { "epoch": 0.36464, "grad_norm": 1.9140625, "grad_norm_var": 0.00618896484375, "learning_rate": 0.0001, "loss": 3.8729, "loss/crossentropy": 1.727245271205902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17305339127779007, "step": 18232 }, { "epoch": 0.36468, "grad_norm": 2.078125, "grad_norm_var": 0.005574289957682292, "learning_rate": 0.0001, "loss": 4.0496, "loss/crossentropy": 2.0269583463668823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2060471773147583, "step": 18234 }, { "epoch": 0.36472, "grad_norm": 2.015625, "grad_norm_var": 0.004801177978515625, "learning_rate": 0.0001, "loss": 4.1707, "loss/crossentropy": 1.9194093346595764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19781427085399628, "step": 18236 }, { "epoch": 0.36476, "grad_norm": 2.078125, "grad_norm_var": 0.0055328369140625, "learning_rate": 0.0001, "loss": 4.1438, "loss/crossentropy": 1.7539438605308533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19985094666481018, "step": 18238 }, { "epoch": 0.3648, "grad_norm": 1.890625, "grad_norm_var": 0.006068674723307291, "learning_rate": 0.0001, "loss": 4.1287, "loss/crossentropy": 2.1092851161956787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23365512490272522, "step": 18240 }, { "epoch": 0.36484, "grad_norm": 1.984375, "grad_norm_var": 0.00771484375, "learning_rate": 0.0001, "loss": 3.6435, "loss/crossentropy": 1.772037386894226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18215186148881912, "step": 18242 }, { "epoch": 0.36488, "grad_norm": 2.109375, "grad_norm_var": 0.007845052083333333, "learning_rate": 0.0001, "loss": 4.1017, "loss/crossentropy": 2.0108843445777893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20684240013360977, "step": 18244 }, { "epoch": 0.36492, "grad_norm": 2.109375, "grad_norm_var": 0.008348592122395833, "learning_rate": 0.0001, "loss": 4.0077, "loss/crossentropy": 2.0076091289520264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1842496171593666, "step": 18246 }, { "epoch": 0.36496, "grad_norm": 1.9921875, "grad_norm_var": 0.008085123697916667, "learning_rate": 0.0001, "loss": 4.1357, "loss/crossentropy": 2.1824593544006348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19478252530097961, "step": 18248 }, { "epoch": 0.365, "grad_norm": 1.9921875, "grad_norm_var": 0.0068318684895833336, "learning_rate": 0.0001, "loss": 4.2231, "loss/crossentropy": 2.1033846139907837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21361663192510605, "step": 18250 }, { "epoch": 0.36504, "grad_norm": 1.84375, "grad_norm_var": 0.009299468994140626, "learning_rate": 0.0001, "loss": 3.7738, "loss/crossentropy": 2.142418146133423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20618261396884918, "step": 18252 }, { "epoch": 0.36508, "grad_norm": 1.9453125, "grad_norm_var": 0.008131663004557291, "learning_rate": 0.0001, "loss": 4.1037, "loss/crossentropy": 2.061814546585083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20519836992025375, "step": 18254 }, { "epoch": 0.36512, "grad_norm": 2.015625, "grad_norm_var": 0.008512115478515625, "learning_rate": 0.0001, "loss": 4.0512, "loss/crossentropy": 2.0216987133026123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956964135169983, "step": 18256 }, { "epoch": 0.36516, "grad_norm": 2.03125, "grad_norm_var": 0.006681315104166667, "learning_rate": 0.0001, "loss": 4.301, "loss/crossentropy": 2.2619231939315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21638523787260056, "step": 18258 }, { "epoch": 0.3652, "grad_norm": 2.09375, "grad_norm_var": 0.006281534830729167, "learning_rate": 0.0001, "loss": 4.165, "loss/crossentropy": 1.8058243989944458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19046328961849213, "step": 18260 }, { "epoch": 0.36524, "grad_norm": 2.046875, "grad_norm_var": 0.005639394124348958, "learning_rate": 0.0001, "loss": 4.1876, "loss/crossentropy": 2.0969032049179077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20291081070899963, "step": 18262 }, { "epoch": 0.36528, "grad_norm": 1.8125, "grad_norm_var": 0.006794993082682292, "learning_rate": 0.0001, "loss": 3.8754, "loss/crossentropy": 1.9567083716392517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19624605774879456, "step": 18264 }, { "epoch": 0.36532, "grad_norm": 1.921875, "grad_norm_var": 0.006493123372395834, "learning_rate": 0.0001, "loss": 3.9861, "loss/crossentropy": 1.9015939235687256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18771862238645554, "step": 18266 }, { "epoch": 0.36536, "grad_norm": 1.9375, "grad_norm_var": 0.017575836181640624, "learning_rate": 0.0001, "loss": 4.0288, "loss/crossentropy": 1.9432410597801208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1785590723156929, "step": 18268 }, { "epoch": 0.3654, "grad_norm": 2.0, "grad_norm_var": 0.017438761393229165, "learning_rate": 0.0001, "loss": 4.3256, "loss/crossentropy": 2.216074585914612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21463461220264435, "step": 18270 }, { "epoch": 0.36544, "grad_norm": 1.8203125, "grad_norm_var": 0.020401763916015624, "learning_rate": 0.0001, "loss": 3.7192, "loss/crossentropy": 1.5440006256103516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1621650904417038, "step": 18272 }, { "epoch": 0.36548, "grad_norm": 1.8203125, "grad_norm_var": 0.02197850545247396, "learning_rate": 0.0001, "loss": 3.9072, "loss/crossentropy": 1.9478511214256287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20275350660085678, "step": 18274 }, { "epoch": 0.36552, "grad_norm": 2.0, "grad_norm_var": 0.02217381795247396, "learning_rate": 0.0001, "loss": 4.1986, "loss/crossentropy": 2.16433984041214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21180611103773117, "step": 18276 }, { "epoch": 0.36556, "grad_norm": 2.015625, "grad_norm_var": 0.02182184855143229, "learning_rate": 0.0001, "loss": 4.0345, "loss/crossentropy": 1.9628196954727173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198894202709198, "step": 18278 }, { "epoch": 0.3656, "grad_norm": 1.8984375, "grad_norm_var": 0.020409901936848957, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 1.7818017601966858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19511079788208008, "step": 18280 }, { "epoch": 0.36564, "grad_norm": 2.15625, "grad_norm_var": 0.02315241495768229, "learning_rate": 0.0001, "loss": 4.1769, "loss/crossentropy": 2.0149444341659546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128366306424141, "step": 18282 }, { "epoch": 0.36568, "grad_norm": 1.8671875, "grad_norm_var": 0.010351308186848958, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 2.1580519676208496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21166586130857468, "step": 18284 }, { "epoch": 0.36572, "grad_norm": 1.859375, "grad_norm_var": 0.03432591756184896, "learning_rate": 0.0001, "loss": 3.9196, "loss/crossentropy": 2.308629631996155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20283856242895126, "step": 18286 }, { "epoch": 0.36576, "grad_norm": 1.9296875, "grad_norm_var": 0.03199055989583333, "learning_rate": 0.0001, "loss": 4.0587, "loss/crossentropy": 2.1761614084243774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212914377450943, "step": 18288 }, { "epoch": 0.3658, "grad_norm": 1.9453125, "grad_norm_var": 0.029842122395833334, "learning_rate": 0.0001, "loss": 4.191, "loss/crossentropy": 1.9851300120353699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1898990124464035, "step": 18290 }, { "epoch": 0.36584, "grad_norm": 1.859375, "grad_norm_var": 0.031172688802083334, "learning_rate": 0.0001, "loss": 4.028, "loss/crossentropy": 2.1123871207237244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20982655137777328, "step": 18292 }, { "epoch": 0.36588, "grad_norm": 2.0625, "grad_norm_var": 0.03831761678059896, "learning_rate": 0.0001, "loss": 4.3007, "loss/crossentropy": 2.2666051387786865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23328793793916702, "step": 18294 }, { "epoch": 0.36592, "grad_norm": 1.9765625, "grad_norm_var": 0.03904393513997396, "learning_rate": 0.0001, "loss": 3.8741, "loss/crossentropy": 1.9444871544837952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20335105061531067, "step": 18296 }, { "epoch": 0.36596, "grad_norm": 2.046875, "grad_norm_var": 0.036717732747395836, "learning_rate": 0.0001, "loss": 4.0799, "loss/crossentropy": 2.136604368686676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20185644924640656, "step": 18298 }, { "epoch": 0.366, "grad_norm": 2.59375, "grad_norm_var": 0.059024810791015625, "learning_rate": 0.0001, "loss": 4.0605, "loss/crossentropy": 1.9029142260551453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24649157375097275, "step": 18300 }, { "epoch": 0.36604, "grad_norm": 1.9296875, "grad_norm_var": 0.0400054931640625, "learning_rate": 0.0001, "loss": 4.1685, "loss/crossentropy": 2.0363988876342773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20920708030462265, "step": 18302 }, { "epoch": 0.36608, "grad_norm": 1.8984375, "grad_norm_var": 0.04248860677083333, "learning_rate": 0.0001, "loss": 3.5556, "loss/crossentropy": 1.8357294797897339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18201576173305511, "step": 18304 }, { "epoch": 0.36612, "grad_norm": 2.109375, "grad_norm_var": 0.04253743489583333, "learning_rate": 0.0001, "loss": 4.3381, "loss/crossentropy": 2.194098114967346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20245323330163956, "step": 18306 }, { "epoch": 0.36616, "grad_norm": 2.03125, "grad_norm_var": 0.0409332275390625, "learning_rate": 0.0001, "loss": 4.2501, "loss/crossentropy": 2.3840869665145874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2243075668811798, "step": 18308 }, { "epoch": 0.3662, "grad_norm": 1.9375, "grad_norm_var": 0.034398396809895836, "learning_rate": 0.0001, "loss": 4.1895, "loss/crossentropy": 2.3093236684799194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22531048208475113, "step": 18310 }, { "epoch": 0.36624, "grad_norm": 1.9296875, "grad_norm_var": 0.03472493489583333, "learning_rate": 0.0001, "loss": 3.7629, "loss/crossentropy": 2.155013680458069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19654065370559692, "step": 18312 }, { "epoch": 0.36628, "grad_norm": 1.9453125, "grad_norm_var": 0.03486226399739583, "learning_rate": 0.0001, "loss": 4.0241, "loss/crossentropy": 1.8823603391647339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18333254009485245, "step": 18314 }, { "epoch": 0.36632, "grad_norm": 2.046875, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 3.8092, "loss/crossentropy": 1.9217600226402283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18095016479492188, "step": 18316 }, { "epoch": 0.36636, "grad_norm": 1.84375, "grad_norm_var": 0.009004720052083333, "learning_rate": 0.0001, "loss": 3.8178, "loss/crossentropy": 1.7448294758796692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18972519785165787, "step": 18318 }, { "epoch": 0.3664, "grad_norm": 1.8359375, "grad_norm_var": 0.008442942301432292, "learning_rate": 0.0001, "loss": 3.9394, "loss/crossentropy": 1.8730336427688599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18396812677383423, "step": 18320 }, { "epoch": 0.36644, "grad_norm": 1.9921875, "grad_norm_var": 0.006955718994140625, "learning_rate": 0.0001, "loss": 3.8763, "loss/crossentropy": 2.075575351715088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20251936465501785, "step": 18322 }, { "epoch": 0.36648, "grad_norm": 1.96875, "grad_norm_var": 0.004131825764973959, "learning_rate": 0.0001, "loss": 4.0955, "loss/crossentropy": 2.083173990249634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003224566578865, "step": 18324 }, { "epoch": 0.36652, "grad_norm": 1.8671875, "grad_norm_var": 0.00396728515625, "learning_rate": 0.0001, "loss": 4.0245, "loss/crossentropy": 2.4401766061782837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21872484683990479, "step": 18326 }, { "epoch": 0.36656, "grad_norm": 1.9140625, "grad_norm_var": 0.0038533528645833333, "learning_rate": 0.0001, "loss": 4.0249, "loss/crossentropy": 2.2402881383895874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977379471063614, "step": 18328 }, { "epoch": 0.3666, "grad_norm": 1.921875, "grad_norm_var": 0.004650624593098959, "learning_rate": 0.0001, "loss": 4.1047, "loss/crossentropy": 2.2246369123458862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21132111549377441, "step": 18330 }, { "epoch": 0.36664, "grad_norm": 2.046875, "grad_norm_var": 0.0043365478515625, "learning_rate": 0.0001, "loss": 4.1607, "loss/crossentropy": 1.8048500418663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18298982083797455, "step": 18332 }, { "epoch": 0.36668, "grad_norm": 2.15625, "grad_norm_var": 0.006514231363932292, "learning_rate": 0.0001, "loss": 4.0987, "loss/crossentropy": 2.3368008136749268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21743982285261154, "step": 18334 }, { "epoch": 0.36672, "grad_norm": 2.234375, "grad_norm_var": 0.010188547770182292, "learning_rate": 0.0001, "loss": 4.3158, "loss/crossentropy": 2.3520134687423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22603602707386017, "step": 18336 }, { "epoch": 0.36676, "grad_norm": 2.140625, "grad_norm_var": 0.011248524983723958, "learning_rate": 0.0001, "loss": 4.0037, "loss/crossentropy": 2.0071199536323547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18584266304969788, "step": 18338 }, { "epoch": 0.3668, "grad_norm": 2.234375, "grad_norm_var": 0.019260406494140625, "learning_rate": 0.0001, "loss": 4.2787, "loss/crossentropy": 1.9329636693000793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21350353956222534, "step": 18340 }, { "epoch": 0.36684, "grad_norm": 2.0, "grad_norm_var": 0.015990193684895834, "learning_rate": 0.0001, "loss": 4.0387, "loss/crossentropy": 1.8231948018074036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064230963587761, "step": 18342 }, { "epoch": 0.36688, "grad_norm": 1.828125, "grad_norm_var": 0.01605809529622396, "learning_rate": 0.0001, "loss": 4.0888, "loss/crossentropy": 1.9928399324417114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996312066912651, "step": 18344 }, { "epoch": 0.36692, "grad_norm": 1.96875, "grad_norm_var": 0.015916951497395835, "learning_rate": 0.0001, "loss": 4.1736, "loss/crossentropy": 2.139783501625061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20407474786043167, "step": 18346 }, { "epoch": 0.36696, "grad_norm": 1.8984375, "grad_norm_var": 0.017166900634765624, "learning_rate": 0.0001, "loss": 4.0357, "loss/crossentropy": 1.8519954681396484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18404167890548706, "step": 18348 }, { "epoch": 0.367, "grad_norm": 2.015625, "grad_norm_var": 0.016361236572265625, "learning_rate": 0.0001, "loss": 4.1325, "loss/crossentropy": 2.037365674972534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2384558469057083, "step": 18350 }, { "epoch": 0.36704, "grad_norm": 1.875, "grad_norm_var": 0.015600331624348958, "learning_rate": 0.0001, "loss": 3.887, "loss/crossentropy": 2.0023937821388245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19700156897306442, "step": 18352 }, { "epoch": 0.36708, "grad_norm": 2.125, "grad_norm_var": 0.015282185872395833, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 1.829107940196991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19559869915246964, "step": 18354 }, { "epoch": 0.36712, "grad_norm": 1.9453125, "grad_norm_var": 0.006012980143229167, "learning_rate": 0.0001, "loss": 4.0962, "loss/crossentropy": 2.206624150276184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166636437177658, "step": 18356 }, { "epoch": 0.36716, "grad_norm": 1.9921875, "grad_norm_var": 0.005907185872395833, "learning_rate": 0.0001, "loss": 4.1191, "loss/crossentropy": 2.1971875429153442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20924220979213715, "step": 18358 }, { "epoch": 0.3672, "grad_norm": 2.1875, "grad_norm_var": 0.0069539388020833336, "learning_rate": 0.0001, "loss": 4.4876, "loss/crossentropy": 2.554604172706604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2638286352157593, "step": 18360 }, { "epoch": 0.36724, "grad_norm": 1.9375, "grad_norm_var": 0.007981109619140624, "learning_rate": 0.0001, "loss": 3.9076, "loss/crossentropy": 2.088346302509308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1948780044913292, "step": 18362 }, { "epoch": 0.36728, "grad_norm": 1.875, "grad_norm_var": 0.0078033447265625, "learning_rate": 0.0001, "loss": 4.0101, "loss/crossentropy": 2.24162495136261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21616502106189728, "step": 18364 }, { "epoch": 0.36732, "grad_norm": 1.8515625, "grad_norm_var": 0.008567047119140626, "learning_rate": 0.0001, "loss": 3.8693, "loss/crossentropy": 2.249878764152527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312733307480812, "step": 18366 }, { "epoch": 0.36736, "grad_norm": 1.921875, "grad_norm_var": 0.0083404541015625, "learning_rate": 0.0001, "loss": 4.0711, "loss/crossentropy": 2.0440531969070435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18561603128910065, "step": 18368 }, { "epoch": 0.3674, "grad_norm": 1.9921875, "grad_norm_var": 0.006696573893229167, "learning_rate": 0.0001, "loss": 4.2582, "loss/crossentropy": 2.2324228286743164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20741964876651764, "step": 18370 }, { "epoch": 0.36744, "grad_norm": 2.125, "grad_norm_var": 0.008634440104166667, "learning_rate": 0.0001, "loss": 4.2807, "loss/crossentropy": 2.168972373008728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215408056974411, "step": 18372 }, { "epoch": 0.36748, "grad_norm": 1.890625, "grad_norm_var": 0.008955637613932291, "learning_rate": 0.0001, "loss": 4.0341, "loss/crossentropy": 2.076392412185669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023996263742447, "step": 18374 }, { "epoch": 0.36752, "grad_norm": 2.046875, "grad_norm_var": 0.005968983968098958, "learning_rate": 0.0001, "loss": 4.2958, "loss/crossentropy": 2.0497928857803345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21541842073202133, "step": 18376 }, { "epoch": 0.36756, "grad_norm": 1.6875, "grad_norm_var": 0.0102294921875, "learning_rate": 0.0001, "loss": 3.6889, "loss/crossentropy": 1.8894451260566711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18294879794120789, "step": 18378 }, { "epoch": 0.3676, "grad_norm": 1.984375, "grad_norm_var": 0.010545857747395833, "learning_rate": 0.0001, "loss": 4.0237, "loss/crossentropy": 2.1654014587402344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18372679501771927, "step": 18380 }, { "epoch": 0.36764, "grad_norm": 2.1875, "grad_norm_var": 0.013516998291015625, "learning_rate": 0.0001, "loss": 4.0657, "loss/crossentropy": 2.203396499156952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22593200951814651, "step": 18382 }, { "epoch": 0.36768, "grad_norm": 2.03125, "grad_norm_var": 0.014090728759765626, "learning_rate": 0.0001, "loss": 3.946, "loss/crossentropy": 1.7496543526649475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1825375333428383, "step": 18384 }, { "epoch": 0.36772, "grad_norm": 1.875, "grad_norm_var": 0.014249420166015625, "learning_rate": 0.0001, "loss": 3.7667, "loss/crossentropy": 1.9488004446029663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18244057893753052, "step": 18386 }, { "epoch": 0.36776, "grad_norm": 1.9453125, "grad_norm_var": 0.01231689453125, "learning_rate": 0.0001, "loss": 3.9406, "loss/crossentropy": 1.9792875051498413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20204880088567734, "step": 18388 }, { "epoch": 0.3678, "grad_norm": 1.890625, "grad_norm_var": 0.012412261962890626, "learning_rate": 0.0001, "loss": 4.0077, "loss/crossentropy": 1.890614092350006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19333010911941528, "step": 18390 }, { "epoch": 0.36784, "grad_norm": 1.96875, "grad_norm_var": 0.011832427978515626, "learning_rate": 0.0001, "loss": 4.0615, "loss/crossentropy": 2.0901471972465515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20534122735261917, "step": 18392 }, { "epoch": 0.36788, "grad_norm": 2.109375, "grad_norm_var": 0.00716552734375, "learning_rate": 0.0001, "loss": 4.2502, "loss/crossentropy": 2.2931089401245117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2130991816520691, "step": 18394 }, { "epoch": 0.36792, "grad_norm": 2.078125, "grad_norm_var": 0.0077392578125, "learning_rate": 0.0001, "loss": 3.9862, "loss/crossentropy": 2.2144237756729126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22273198515176773, "step": 18396 }, { "epoch": 0.36796, "grad_norm": 1.8046875, "grad_norm_var": 0.006624094645182292, "learning_rate": 0.0001, "loss": 3.7365, "loss/crossentropy": 1.9135422110557556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983061358332634, "step": 18398 }, { "epoch": 0.368, "grad_norm": 2.0, "grad_norm_var": 0.007456207275390625, "learning_rate": 0.0001, "loss": 3.7556, "loss/crossentropy": 1.692852795124054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1670849658548832, "step": 18400 }, { "epoch": 0.36804, "grad_norm": 2.09375, "grad_norm_var": 0.0083160400390625, "learning_rate": 0.0001, "loss": 4.0043, "loss/crossentropy": 1.908652126789093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21787738054990768, "step": 18402 }, { "epoch": 0.36808, "grad_norm": 2.015625, "grad_norm_var": 0.008432769775390625, "learning_rate": 0.0001, "loss": 4.2343, "loss/crossentropy": 2.0613549947738647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20495950430631638, "step": 18404 }, { "epoch": 0.36812, "grad_norm": 1.890625, "grad_norm_var": 0.026759592692057292, "learning_rate": 0.0001, "loss": 4.0549, "loss/crossentropy": 1.9765326976776123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18859146535396576, "step": 18406 }, { "epoch": 0.36816, "grad_norm": 1.953125, "grad_norm_var": 0.027147420247395835, "learning_rate": 0.0001, "loss": 4.0196, "loss/crossentropy": 1.8403696417808533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18926746398210526, "step": 18408 }, { "epoch": 0.3682, "grad_norm": 2.15625, "grad_norm_var": 0.029080963134765624, "learning_rate": 0.0001, "loss": 4.1859, "loss/crossentropy": 2.1792644262313843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22263357043266296, "step": 18410 }, { "epoch": 0.36824, "grad_norm": 2.0625, "grad_norm_var": 0.02910334269205729, "learning_rate": 0.0001, "loss": 4.2387, "loss/crossentropy": 2.079226016998291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20990607887506485, "step": 18412 }, { "epoch": 0.36828, "grad_norm": 1.8671875, "grad_norm_var": 0.027457427978515626, "learning_rate": 0.0001, "loss": 4.2093, "loss/crossentropy": 2.1428889632225037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22140806913375854, "step": 18414 }, { "epoch": 0.36832, "grad_norm": 1.984375, "grad_norm_var": 0.024589029947916667, "learning_rate": 0.0001, "loss": 4.3761, "loss/crossentropy": 2.1461609601974487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20594316720962524, "step": 18416 }, { "epoch": 0.36836, "grad_norm": 1.9609375, "grad_norm_var": 0.02399470011393229, "learning_rate": 0.0001, "loss": 4.1597, "loss/crossentropy": 1.980036735534668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.201970636844635, "step": 18418 }, { "epoch": 0.3684, "grad_norm": 1.9140625, "grad_norm_var": 0.024773915608723957, "learning_rate": 0.0001, "loss": 3.9847, "loss/crossentropy": 1.6273554563522339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938486322760582, "step": 18420 }, { "epoch": 0.36844, "grad_norm": 2.03125, "grad_norm_var": 0.006251780192057291, "learning_rate": 0.0001, "loss": 4.1784, "loss/crossentropy": 1.9100900292396545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18063092976808548, "step": 18422 }, { "epoch": 0.36848, "grad_norm": 1.9140625, "grad_norm_var": 0.0063555399576822914, "learning_rate": 0.0001, "loss": 4.1282, "loss/crossentropy": 2.1882822513580322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21261122822761536, "step": 18424 }, { "epoch": 0.36852, "grad_norm": 1.8359375, "grad_norm_var": 0.0046384175618489586, "learning_rate": 0.0001, "loss": 3.9556, "loss/crossentropy": 2.210257649421692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929008737206459, "step": 18426 }, { "epoch": 0.36856, "grad_norm": 2.078125, "grad_norm_var": 0.005041249593098958, "learning_rate": 0.0001, "loss": 4.1691, "loss/crossentropy": 2.0247724056243896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004971206188202, "step": 18428 }, { "epoch": 0.3686, "grad_norm": 2.015625, "grad_norm_var": 0.004133097330729167, "learning_rate": 0.0001, "loss": 4.2092, "loss/crossentropy": 1.9844316244125366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210312619805336, "step": 18430 }, { "epoch": 0.36864, "grad_norm": 1.9453125, "grad_norm_var": 0.005000559488932291, "learning_rate": 0.0001, "loss": 4.0368, "loss/crossentropy": 2.2416625022888184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21065251529216766, "step": 18432 }, { "epoch": 0.36868, "grad_norm": 2.453125, "grad_norm_var": 0.020104726155598957, "learning_rate": 0.0001, "loss": 4.4109, "loss/crossentropy": 2.0283551812171936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150302305817604, "step": 18434 }, { "epoch": 0.36872, "grad_norm": 2.015625, "grad_norm_var": 0.01968994140625, "learning_rate": 0.0001, "loss": 4.1379, "loss/crossentropy": 2.210233688354492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21831902861595154, "step": 18436 }, { "epoch": 0.36876, "grad_norm": 1.9375, "grad_norm_var": 0.0205474853515625, "learning_rate": 0.0001, "loss": 3.9993, "loss/crossentropy": 2.0454147458076477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2432640790939331, "step": 18438 }, { "epoch": 0.3688, "grad_norm": 2.078125, "grad_norm_var": 0.020444488525390624, "learning_rate": 0.0001, "loss": 4.1243, "loss/crossentropy": 2.024592399597168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20496471971273422, "step": 18440 }, { "epoch": 0.36884, "grad_norm": 2.125, "grad_norm_var": 0.018822987874348957, "learning_rate": 0.0001, "loss": 4.3058, "loss/crossentropy": 2.4205459356307983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22281523793935776, "step": 18442 }, { "epoch": 0.36888, "grad_norm": 1.96875, "grad_norm_var": 0.0170806884765625, "learning_rate": 0.0001, "loss": 4.0341, "loss/crossentropy": 1.9198943376541138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916896402835846, "step": 18444 }, { "epoch": 0.36892, "grad_norm": 2.0625, "grad_norm_var": 0.0171051025390625, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 1.928059160709381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1899876967072487, "step": 18446 }, { "epoch": 0.36896, "grad_norm": 1.921875, "grad_norm_var": 0.020792388916015626, "learning_rate": 0.0001, "loss": 4.3327, "loss/crossentropy": 2.346290349960327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22388508170843124, "step": 18448 }, { "epoch": 0.369, "grad_norm": 1.9140625, "grad_norm_var": 0.009946441650390625, "learning_rate": 0.0001, "loss": 3.9064, "loss/crossentropy": 2.0108938217163086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19462434202432632, "step": 18450 }, { "epoch": 0.36904, "grad_norm": 2.0, "grad_norm_var": 0.00986328125, "learning_rate": 0.0001, "loss": 4.2468, "loss/crossentropy": 1.7108886241912842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18806710839271545, "step": 18452 }, { "epoch": 0.36908, "grad_norm": 5.1875, "grad_norm_var": 0.6345499674479167, "learning_rate": 0.0001, "loss": 4.6972, "loss/crossentropy": 2.4162802696228027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3217930570244789, "step": 18454 }, { "epoch": 0.36912, "grad_norm": 2.1875, "grad_norm_var": 0.639013671875, "learning_rate": 0.0001, "loss": 3.4762, "loss/crossentropy": 1.6459838151931763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17543485760688782, "step": 18456 }, { "epoch": 0.36916, "grad_norm": 2.140625, "grad_norm_var": 0.640679677327474, "learning_rate": 0.0001, "loss": 4.1194, "loss/crossentropy": 2.2111966013908386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22191359847784042, "step": 18458 }, { "epoch": 0.3692, "grad_norm": 2.125, "grad_norm_var": 0.6325887044270834, "learning_rate": 0.0001, "loss": 4.301, "loss/crossentropy": 2.2514692544937134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21384654194116592, "step": 18460 }, { "epoch": 0.36924, "grad_norm": 2.125, "grad_norm_var": 0.63665771484375, "learning_rate": 0.0001, "loss": 4.307, "loss/crossentropy": 2.097872793674469, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21747223287820816, "step": 18462 }, { "epoch": 0.36928, "grad_norm": 2.03125, "grad_norm_var": 0.6418690999348958, "learning_rate": 0.0001, "loss": 4.1033, "loss/crossentropy": 2.1611807346343994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042219042778015, "step": 18464 }, { "epoch": 0.36932, "grad_norm": 2.15625, "grad_norm_var": 0.62939453125, "learning_rate": 0.0001, "loss": 3.7871, "loss/crossentropy": 1.8941562175750732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192266546189785, "step": 18466 }, { "epoch": 0.36936, "grad_norm": 1.921875, "grad_norm_var": 0.72021484375, "learning_rate": 0.0001, "loss": 4.0358, "loss/crossentropy": 1.8930317163467407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19436348974704742, "step": 18468 }, { "epoch": 0.3694, "grad_norm": 1.9140625, "grad_norm_var": 0.15979588826497396, "learning_rate": 0.0001, "loss": 4.1046, "loss/crossentropy": 2.1041005849838257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21314333379268646, "step": 18470 }, { "epoch": 0.36944, "grad_norm": 1.9453125, "grad_norm_var": 0.15750732421875, "learning_rate": 0.0001, "loss": 4.2096, "loss/crossentropy": 1.8747637867927551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19172174483537674, "step": 18472 }, { "epoch": 0.36948, "grad_norm": 1.9453125, "grad_norm_var": 0.15751113891601562, "learning_rate": 0.0001, "loss": 3.9809, "loss/crossentropy": 2.1883610486984253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19927512109279633, "step": 18474 }, { "epoch": 0.36952, "grad_norm": 1.9921875, "grad_norm_var": 0.1606353759765625, "learning_rate": 0.0001, "loss": 3.9303, "loss/crossentropy": 1.972772240638733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17948149144649506, "step": 18476 }, { "epoch": 0.36956, "grad_norm": 2.0625, "grad_norm_var": 0.14570490519205728, "learning_rate": 0.0001, "loss": 4.1246, "loss/crossentropy": 1.8912597298622131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18716244399547577, "step": 18478 }, { "epoch": 0.3696, "grad_norm": 1.9140625, "grad_norm_var": 0.1458740234375, "learning_rate": 0.0001, "loss": 4.0816, "loss/crossentropy": 2.044828712940216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076854333281517, "step": 18480 }, { "epoch": 0.36964, "grad_norm": 2.03125, "grad_norm_var": 0.14629618326822916, "learning_rate": 0.0001, "loss": 4.2007, "loss/crossentropy": 2.0936968326568604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22010911256074905, "step": 18482 }, { "epoch": 0.36968, "grad_norm": 1.96875, "grad_norm_var": 0.0027414957682291665, "learning_rate": 0.0001, "loss": 4.08, "loss/crossentropy": 2.2678394317626953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073991820216179, "step": 18484 }, { "epoch": 0.36972, "grad_norm": 2.078125, "grad_norm_var": 0.0031064351399739585, "learning_rate": 0.0001, "loss": 3.9361, "loss/crossentropy": 2.281362295150757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015322744846344, "step": 18486 }, { "epoch": 0.36976, "grad_norm": 2.015625, "grad_norm_var": 0.0034739176432291665, "learning_rate": 0.0001, "loss": 4.2318, "loss/crossentropy": 2.0029674768447876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155751883983612, "step": 18488 }, { "epoch": 0.3698, "grad_norm": 1.984375, "grad_norm_var": 0.003525543212890625, "learning_rate": 0.0001, "loss": 4.3761, "loss/crossentropy": 2.370519280433655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308923304080963, "step": 18490 }, { "epoch": 0.36984, "grad_norm": 2.09375, "grad_norm_var": 0.004109446207682292, "learning_rate": 0.0001, "loss": 4.3137, "loss/crossentropy": 1.9416582584381104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18809420615434647, "step": 18492 }, { "epoch": 0.36988, "grad_norm": 1.9765625, "grad_norm_var": 0.003885650634765625, "learning_rate": 0.0001, "loss": 4.1381, "loss/crossentropy": 2.065304160118103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19029072672128677, "step": 18494 }, { "epoch": 0.36992, "grad_norm": 1.9296875, "grad_norm_var": 0.0036516825358072916, "learning_rate": 0.0001, "loss": 4.1366, "loss/crossentropy": 1.9798340201377869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1716388538479805, "step": 18496 }, { "epoch": 0.36996, "grad_norm": 1.984375, "grad_norm_var": 0.0035540262858072915, "learning_rate": 0.0001, "loss": 4.2321, "loss/crossentropy": 2.336732864379883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19805839657783508, "step": 18498 }, { "epoch": 0.37, "grad_norm": 1.984375, "grad_norm_var": 0.0033444722493489584, "learning_rate": 0.0001, "loss": 4.08, "loss/crossentropy": 1.9556902050971985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19576922804117203, "step": 18500 }, { "epoch": 0.37004, "grad_norm": 1.796875, "grad_norm_var": 0.006581370035807292, "learning_rate": 0.0001, "loss": 3.7281, "loss/crossentropy": 2.0062427520751953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19548972696065903, "step": 18502 }, { "epoch": 0.37008, "grad_norm": 1.9375, "grad_norm_var": 0.0061419169108072914, "learning_rate": 0.0001, "loss": 4.0954, "loss/crossentropy": 2.1030293703079224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22147215902805328, "step": 18504 }, { "epoch": 0.37012, "grad_norm": 2.046875, "grad_norm_var": 0.005163319905598958, "learning_rate": 0.0001, "loss": 3.9545, "loss/crossentropy": 1.7618860006332397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17934025079011917, "step": 18506 }, { "epoch": 0.37016, "grad_norm": 2.109375, "grad_norm_var": 0.005454254150390625, "learning_rate": 0.0001, "loss": 4.239, "loss/crossentropy": 2.4424854516983032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23131447285413742, "step": 18508 }, { "epoch": 0.3702, "grad_norm": 1.96875, "grad_norm_var": 0.0076904296875, "learning_rate": 0.0001, "loss": 4.0365, "loss/crossentropy": 2.125267446041107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19127625226974487, "step": 18510 }, { "epoch": 0.37024, "grad_norm": 1.828125, "grad_norm_var": 0.008506011962890626, "learning_rate": 0.0001, "loss": 3.9498, "loss/crossentropy": 2.2222214937210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204838365316391, "step": 18512 }, { "epoch": 0.37028, "grad_norm": 1.8984375, "grad_norm_var": 0.009056599934895833, "learning_rate": 0.0001, "loss": 4.1012, "loss/crossentropy": 2.1930031776428223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21769095957279205, "step": 18514 }, { "epoch": 0.37032, "grad_norm": 1.84375, "grad_norm_var": 0.010814412434895834, "learning_rate": 0.0001, "loss": 4.0068, "loss/crossentropy": 2.078152060508728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19528701901435852, "step": 18516 }, { "epoch": 0.37036, "grad_norm": 1.9296875, "grad_norm_var": 0.009016927083333333, "learning_rate": 0.0001, "loss": 4.0135, "loss/crossentropy": 1.9786349534988403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19193317741155624, "step": 18518 }, { "epoch": 0.3704, "grad_norm": 2.015625, "grad_norm_var": 0.0119293212890625, "learning_rate": 0.0001, "loss": 4.1458, "loss/crossentropy": 2.279360294342041, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20865458250045776, "step": 18520 }, { "epoch": 0.37044, "grad_norm": 2.078125, "grad_norm_var": 0.012548828125, "learning_rate": 0.0001, "loss": 4.195, "loss/crossentropy": 2.0668978691101074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19060996919870377, "step": 18522 }, { "epoch": 0.37048, "grad_norm": 1.90625, "grad_norm_var": 0.011116536458333333, "learning_rate": 0.0001, "loss": 4.2661, "loss/crossentropy": 2.4433913230895996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2263498529791832, "step": 18524 }, { "epoch": 0.37052, "grad_norm": 1.9765625, "grad_norm_var": 0.009049224853515624, "learning_rate": 0.0001, "loss": 3.7953, "loss/crossentropy": 1.7785582542419434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18417692929506302, "step": 18526 }, { "epoch": 0.37056, "grad_norm": 2.015625, "grad_norm_var": 0.008499908447265624, "learning_rate": 0.0001, "loss": 4.2958, "loss/crossentropy": 1.9753797054290771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1898263543844223, "step": 18528 }, { "epoch": 0.3706, "grad_norm": 1.9453125, "grad_norm_var": 0.007696278889973958, "learning_rate": 0.0001, "loss": 4.1945, "loss/crossentropy": 2.302052319049835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21289903670549393, "step": 18530 }, { "epoch": 0.37064, "grad_norm": 1.8828125, "grad_norm_var": 0.007523600260416667, "learning_rate": 0.0001, "loss": 4.1132, "loss/crossentropy": 2.3275071382522583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265043556690216, "step": 18532 }, { "epoch": 0.37068, "grad_norm": 2.046875, "grad_norm_var": 0.00826416015625, "learning_rate": 0.0001, "loss": 4.2004, "loss/crossentropy": 2.3155715465545654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20101473480463028, "step": 18534 }, { "epoch": 0.37072, "grad_norm": 1.8359375, "grad_norm_var": 0.0075681050618489586, "learning_rate": 0.0001, "loss": 3.8546, "loss/crossentropy": 2.0893908739089966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18266596645116806, "step": 18536 }, { "epoch": 0.37076, "grad_norm": 1.9453125, "grad_norm_var": 0.006546783447265625, "learning_rate": 0.0001, "loss": 4.297, "loss/crossentropy": 2.075433909893036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20270193368196487, "step": 18538 }, { "epoch": 0.3708, "grad_norm": 1.90625, "grad_norm_var": 0.006605784098307292, "learning_rate": 0.0001, "loss": 4.186, "loss/crossentropy": 2.2795485258102417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21547292172908783, "step": 18540 }, { "epoch": 0.37084, "grad_norm": 1.859375, "grad_norm_var": 0.006956990559895833, "learning_rate": 0.0001, "loss": 4.0351, "loss/crossentropy": 2.0038467049598694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19927001744508743, "step": 18542 }, { "epoch": 0.37088, "grad_norm": 1.9921875, "grad_norm_var": 0.0056111653645833336, "learning_rate": 0.0001, "loss": 3.979, "loss/crossentropy": 1.8877951502799988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19283732771873474, "step": 18544 }, { "epoch": 0.37092, "grad_norm": 1.90625, "grad_norm_var": 0.0065305074055989586, "learning_rate": 0.0001, "loss": 4.0169, "loss/crossentropy": 1.8111292719841003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20587582886219025, "step": 18546 }, { "epoch": 0.37096, "grad_norm": 1.9609375, "grad_norm_var": 0.004548136393229167, "learning_rate": 0.0001, "loss": 4.0766, "loss/crossentropy": 2.12824147939682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996314972639084, "step": 18548 }, { "epoch": 0.371, "grad_norm": 1.953125, "grad_norm_var": 0.0035845438639322915, "learning_rate": 0.0001, "loss": 3.9549, "loss/crossentropy": 2.22495698928833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20922152698040009, "step": 18550 }, { "epoch": 0.37104, "grad_norm": 1.9609375, "grad_norm_var": 0.003110504150390625, "learning_rate": 0.0001, "loss": 4.2245, "loss/crossentropy": 2.1434414386749268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2070355862379074, "step": 18552 }, { "epoch": 0.37108, "grad_norm": 1.859375, "grad_norm_var": 0.003979237874348959, "learning_rate": 0.0001, "loss": 4.0359, "loss/crossentropy": 2.248233437538147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21400754898786545, "step": 18554 }, { "epoch": 0.37112, "grad_norm": 2.0, "grad_norm_var": 0.0051513671875, "learning_rate": 0.0001, "loss": 4.2072, "loss/crossentropy": 2.246641755104065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22847777605056763, "step": 18556 }, { "epoch": 0.37116, "grad_norm": 1.9765625, "grad_norm_var": 0.004654693603515625, "learning_rate": 0.0001, "loss": 4.0805, "loss/crossentropy": 2.057798206806183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23668289184570312, "step": 18558 }, { "epoch": 0.3712, "grad_norm": 2.0, "grad_norm_var": 0.004587554931640625, "learning_rate": 0.0001, "loss": 3.8454, "loss/crossentropy": 1.7570822834968567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1820850819349289, "step": 18560 }, { "epoch": 0.37124, "grad_norm": 2.390625, "grad_norm_var": 0.015372467041015626, "learning_rate": 0.0001, "loss": 4.0077, "loss/crossentropy": 1.7747303247451782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1704205796122551, "step": 18562 }, { "epoch": 0.37128, "grad_norm": 1.9921875, "grad_norm_var": 0.015632120768229167, "learning_rate": 0.0001, "loss": 4.0611, "loss/crossentropy": 2.3060861825942993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20002726465463638, "step": 18564 }, { "epoch": 0.37132, "grad_norm": 1.921875, "grad_norm_var": 0.015242258707682291, "learning_rate": 0.0001, "loss": 4.258, "loss/crossentropy": 2.2742475271224976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22524481266736984, "step": 18566 }, { "epoch": 0.37136, "grad_norm": 1.859375, "grad_norm_var": 0.01647923787434896, "learning_rate": 0.0001, "loss": 3.9816, "loss/crossentropy": 1.7929689288139343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1819063350558281, "step": 18568 }, { "epoch": 0.3714, "grad_norm": 1.9140625, "grad_norm_var": 0.015718587239583335, "learning_rate": 0.0001, "loss": 4.2487, "loss/crossentropy": 2.4505950212478638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22857815772294998, "step": 18570 }, { "epoch": 0.37144, "grad_norm": 1.828125, "grad_norm_var": 0.017658487955729166, "learning_rate": 0.0001, "loss": 3.5856, "loss/crossentropy": 1.640372097492218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16958069056272507, "step": 18572 }, { "epoch": 0.37148, "grad_norm": 1.984375, "grad_norm_var": 0.01916681925455729, "learning_rate": 0.0001, "loss": 4.1601, "loss/crossentropy": 2.1556472778320312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20749886333942413, "step": 18574 }, { "epoch": 0.37152, "grad_norm": 2.015625, "grad_norm_var": 0.01846491495768229, "learning_rate": 0.0001, "loss": 4.1579, "loss/crossentropy": 2.1131449937820435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20596522092819214, "step": 18576 }, { "epoch": 0.37156, "grad_norm": 1.84375, "grad_norm_var": 0.008998362223307292, "learning_rate": 0.0001, "loss": 3.8036, "loss/crossentropy": 1.8966050148010254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19289513677358627, "step": 18578 }, { "epoch": 0.3716, "grad_norm": 2.109375, "grad_norm_var": 0.010306803385416667, "learning_rate": 0.0001, "loss": 4.1883, "loss/crossentropy": 2.1016936898231506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1917622685432434, "step": 18580 }, { "epoch": 0.37164, "grad_norm": 2.265625, "grad_norm_var": 0.01512451171875, "learning_rate": 0.0001, "loss": 4.2596, "loss/crossentropy": 2.289618492126465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033960521221161, "step": 18582 }, { "epoch": 0.37168, "grad_norm": 1.9765625, "grad_norm_var": 0.020287068684895833, "learning_rate": 0.0001, "loss": 4.0441, "loss/crossentropy": 2.205715775489807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20015332102775574, "step": 18584 }, { "epoch": 0.37172, "grad_norm": 2.171875, "grad_norm_var": 0.02072118123372396, "learning_rate": 0.0001, "loss": 4.0834, "loss/crossentropy": 2.3037471771240234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2424507886171341, "step": 18586 }, { "epoch": 0.37176, "grad_norm": 1.890625, "grad_norm_var": 0.016686757405598957, "learning_rate": 0.0001, "loss": 4.0718, "loss/crossentropy": 2.1757054328918457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20777830481529236, "step": 18588 }, { "epoch": 0.3718, "grad_norm": 2.03125, "grad_norm_var": 0.015860748291015626, "learning_rate": 0.0001, "loss": 4.2905, "loss/crossentropy": 2.3257133960723877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22848188877105713, "step": 18590 }, { "epoch": 0.37184, "grad_norm": 1.9140625, "grad_norm_var": 0.017179107666015624, "learning_rate": 0.0001, "loss": 3.9132, "loss/crossentropy": 2.3375465869903564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21214719116687775, "step": 18592 }, { "epoch": 0.37188, "grad_norm": 1.9609375, "grad_norm_var": 0.0162750244140625, "learning_rate": 0.0001, "loss": 4.507, "loss/crossentropy": 2.1896092891693115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2303280457854271, "step": 18594 }, { "epoch": 0.37192, "grad_norm": 2.0, "grad_norm_var": 0.016287994384765626, "learning_rate": 0.0001, "loss": 3.9078, "loss/crossentropy": 1.8913645148277283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17311514914035797, "step": 18596 }, { "epoch": 0.37196, "grad_norm": 2.03125, "grad_norm_var": 0.012442779541015626, "learning_rate": 0.0001, "loss": 4.2212, "loss/crossentropy": 2.5278197526931763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23568174242973328, "step": 18598 }, { "epoch": 0.372, "grad_norm": 1.9375, "grad_norm_var": 0.0101959228515625, "learning_rate": 0.0001, "loss": 4.1568, "loss/crossentropy": 2.073936700820923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21885155141353607, "step": 18600 }, { "epoch": 0.37204, "grad_norm": 2.265625, "grad_norm_var": 0.012684885660807292, "learning_rate": 0.0001, "loss": 4.388, "loss/crossentropy": 2.162986159324646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2012084275484085, "step": 18602 }, { "epoch": 0.37208, "grad_norm": 1.921875, "grad_norm_var": 0.013108062744140624, "learning_rate": 0.0001, "loss": 3.9659, "loss/crossentropy": 1.9332863092422485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18284077942371368, "step": 18604 }, { "epoch": 0.37212, "grad_norm": 1.9140625, "grad_norm_var": 0.013809967041015624, "learning_rate": 0.0001, "loss": 4.0279, "loss/crossentropy": 2.083172380924225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186289481818676, "step": 18606 }, { "epoch": 0.37216, "grad_norm": 1.96875, "grad_norm_var": 0.013588205973307291, "learning_rate": 0.0001, "loss": 3.9932, "loss/crossentropy": 2.100313901901245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977698802947998, "step": 18608 }, { "epoch": 0.3722, "grad_norm": 1.890625, "grad_norm_var": 0.016792805989583333, "learning_rate": 0.0001, "loss": 4.1086, "loss/crossentropy": 2.2336236238479614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21973469108343124, "step": 18610 }, { "epoch": 0.37224, "grad_norm": 1.984375, "grad_norm_var": 0.017048136393229166, "learning_rate": 0.0001, "loss": 3.9422, "loss/crossentropy": 2.0263352394104004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19232206791639328, "step": 18612 }, { "epoch": 0.37228, "grad_norm": 1.9765625, "grad_norm_var": 0.01727472941080729, "learning_rate": 0.0001, "loss": 4.2295, "loss/crossentropy": 2.0719032287597656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22896382957696915, "step": 18614 }, { "epoch": 0.37232, "grad_norm": 1.84375, "grad_norm_var": 0.015400950113932292, "learning_rate": 0.0001, "loss": 3.881, "loss/crossentropy": 1.9514707326889038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1899290755391121, "step": 18616 }, { "epoch": 0.37236, "grad_norm": 1.921875, "grad_norm_var": 0.010172526041666666, "learning_rate": 0.0001, "loss": 4.0163, "loss/crossentropy": 2.0810786485671997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983262002468109, "step": 18618 }, { "epoch": 0.3724, "grad_norm": 1.921875, "grad_norm_var": 0.010302734375, "learning_rate": 0.0001, "loss": 4.1311, "loss/crossentropy": 1.823366403579712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18820683658123016, "step": 18620 }, { "epoch": 0.37244, "grad_norm": 2.4375, "grad_norm_var": 0.023933919270833333, "learning_rate": 0.0001, "loss": 4.2361, "loss/crossentropy": 2.2814120054244995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19500700384378433, "step": 18622 }, { "epoch": 0.37248, "grad_norm": 1.9765625, "grad_norm_var": 0.023339589436848957, "learning_rate": 0.0001, "loss": 3.9548, "loss/crossentropy": 1.5761349201202393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17777466028928757, "step": 18624 }, { "epoch": 0.37252, "grad_norm": 1.9375, "grad_norm_var": 0.018853505452473957, "learning_rate": 0.0001, "loss": 3.9134, "loss/crossentropy": 1.8996286988258362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21907050907611847, "step": 18626 }, { "epoch": 0.37256, "grad_norm": 1.9296875, "grad_norm_var": 0.020869700113932292, "learning_rate": 0.0001, "loss": 4.2361, "loss/crossentropy": 1.6650620698928833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18808475136756897, "step": 18628 }, { "epoch": 0.3726, "grad_norm": 2.125, "grad_norm_var": 0.02156550089518229, "learning_rate": 0.0001, "loss": 4.4004, "loss/crossentropy": 2.2887942790985107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20416373759508133, "step": 18630 }, { "epoch": 0.37264, "grad_norm": 2.03125, "grad_norm_var": 0.018629709879557293, "learning_rate": 0.0001, "loss": 3.9606, "loss/crossentropy": 1.9479430317878723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18489708751440048, "step": 18632 }, { "epoch": 0.37268, "grad_norm": 2.0, "grad_norm_var": 0.016676584879557293, "learning_rate": 0.0001, "loss": 4.3041, "loss/crossentropy": 2.2156901359558105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20997263491153717, "step": 18634 }, { "epoch": 0.37272, "grad_norm": 2.0, "grad_norm_var": 0.015950520833333332, "learning_rate": 0.0001, "loss": 4.1619, "loss/crossentropy": 2.50583279132843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23347270488739014, "step": 18636 }, { "epoch": 0.37276, "grad_norm": 1.8203125, "grad_norm_var": 0.008650716145833333, "learning_rate": 0.0001, "loss": 3.8241, "loss/crossentropy": 1.7688243985176086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18601343035697937, "step": 18638 }, { "epoch": 0.3728, "grad_norm": 2.109375, "grad_norm_var": 0.009439849853515625, "learning_rate": 0.0001, "loss": 4.2241, "loss/crossentropy": 2.3786444664001465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2287142351269722, "step": 18640 }, { "epoch": 0.37284, "grad_norm": 1.9609375, "grad_norm_var": 0.009479777018229166, "learning_rate": 0.0001, "loss": 4.243, "loss/crossentropy": 2.2686339616775513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21086719632148743, "step": 18642 }, { "epoch": 0.37288, "grad_norm": 2.0, "grad_norm_var": 0.007490793863932292, "learning_rate": 0.0001, "loss": 3.8341, "loss/crossentropy": 1.9386130571365356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19289222359657288, "step": 18644 }, { "epoch": 0.37292, "grad_norm": 1.953125, "grad_norm_var": 0.006819407145182292, "learning_rate": 0.0001, "loss": 4.3357, "loss/crossentropy": 2.3655601739883423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207036018371582, "step": 18646 }, { "epoch": 0.37296, "grad_norm": 1.9375, "grad_norm_var": 0.007100168863932292, "learning_rate": 0.0001, "loss": 3.9039, "loss/crossentropy": 1.9600831270217896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18018172681331635, "step": 18648 }, { "epoch": 0.373, "grad_norm": 2.515625, "grad_norm_var": 0.024873860677083335, "learning_rate": 0.0001, "loss": 3.9492, "loss/crossentropy": 1.86528480052948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19763591885566711, "step": 18650 }, { "epoch": 0.37304, "grad_norm": 1.984375, "grad_norm_var": 0.024925740559895833, "learning_rate": 0.0001, "loss": 4.07, "loss/crossentropy": 2.0938327312469482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051714062690735, "step": 18652 }, { "epoch": 0.37308, "grad_norm": 1.9296875, "grad_norm_var": 0.022454579671223957, "learning_rate": 0.0001, "loss": 3.878, "loss/crossentropy": 1.964399516582489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20198997855186462, "step": 18654 }, { "epoch": 0.37312, "grad_norm": 2.09375, "grad_norm_var": 0.0245849609375, "learning_rate": 0.0001, "loss": 3.9929, "loss/crossentropy": 2.0186336040496826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19310183823108673, "step": 18656 }, { "epoch": 0.37316, "grad_norm": 1.921875, "grad_norm_var": 0.025052642822265624, "learning_rate": 0.0001, "loss": 3.8569, "loss/crossentropy": 1.8805240392684937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20392487198114395, "step": 18658 }, { "epoch": 0.3732, "grad_norm": 1.90625, "grad_norm_var": 0.025480143229166665, "learning_rate": 0.0001, "loss": 4.0306, "loss/crossentropy": 2.45102322101593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2161283940076828, "step": 18660 }, { "epoch": 0.37324, "grad_norm": 1.921875, "grad_norm_var": 0.025679270426432293, "learning_rate": 0.0001, "loss": 4.1015, "loss/crossentropy": 2.2971357107162476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23155176639556885, "step": 18662 }, { "epoch": 0.37328, "grad_norm": 1.953125, "grad_norm_var": 0.025614166259765626, "learning_rate": 0.0001, "loss": 4.285, "loss/crossentropy": 2.3736867904663086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22897879034280777, "step": 18664 }, { "epoch": 0.37332, "grad_norm": 2.0625, "grad_norm_var": 0.005500284830729166, "learning_rate": 0.0001, "loss": 3.9884, "loss/crossentropy": 1.5138108134269714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1526506468653679, "step": 18666 }, { "epoch": 0.37336, "grad_norm": 1.859375, "grad_norm_var": 0.0061948140462239586, "learning_rate": 0.0001, "loss": 3.9507, "loss/crossentropy": 2.415435791015625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21689960360527039, "step": 18668 }, { "epoch": 0.3734, "grad_norm": 1.90625, "grad_norm_var": 0.006306966145833333, "learning_rate": 0.0001, "loss": 4.014, "loss/crossentropy": 2.1235941648483276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20921900868415833, "step": 18670 }, { "epoch": 0.37344, "grad_norm": 2.015625, "grad_norm_var": 0.004428863525390625, "learning_rate": 0.0001, "loss": 4.2549, "loss/crossentropy": 2.3612579703330994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21255160123109818, "step": 18672 }, { "epoch": 0.37348, "grad_norm": 2.09375, "grad_norm_var": 0.054351552327473955, "learning_rate": 0.0001, "loss": 4.1722, "loss/crossentropy": 2.23550283908844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18890902400016785, "step": 18674 }, { "epoch": 0.37352, "grad_norm": 1.9140625, "grad_norm_var": 0.05468317667643229, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 1.7998243570327759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19070486724376678, "step": 18676 }, { "epoch": 0.37356, "grad_norm": 1.984375, "grad_norm_var": 0.055214182535807295, "learning_rate": 0.0001, "loss": 4.151, "loss/crossentropy": 1.892760992050171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19817107915878296, "step": 18678 }, { "epoch": 0.3736, "grad_norm": 1.9140625, "grad_norm_var": 0.055757395426432294, "learning_rate": 0.0001, "loss": 3.9862, "loss/crossentropy": 1.9619091153144836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21299303323030472, "step": 18680 }, { "epoch": 0.37364, "grad_norm": 1.90625, "grad_norm_var": 0.057889556884765624, "learning_rate": 0.0001, "loss": 3.8419, "loss/crossentropy": 2.0857014656066895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20634424686431885, "step": 18682 }, { "epoch": 0.37368, "grad_norm": 1.796875, "grad_norm_var": 0.05814793904622396, "learning_rate": 0.0001, "loss": 3.9589, "loss/crossentropy": 2.1093358397483826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1933949589729309, "step": 18684 }, { "epoch": 0.37372, "grad_norm": 1.9375, "grad_norm_var": 0.05987548828125, "learning_rate": 0.0001, "loss": 3.9537, "loss/crossentropy": 2.1040873527526855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061748430132866, "step": 18686 }, { "epoch": 0.37376, "grad_norm": 1.9609375, "grad_norm_var": 0.059081776936848955, "learning_rate": 0.0001, "loss": 4.1676, "loss/crossentropy": 2.4795751571655273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075038179755211, "step": 18688 }, { "epoch": 0.3738, "grad_norm": 1.8515625, "grad_norm_var": 0.006599934895833334, "learning_rate": 0.0001, "loss": 3.9992, "loss/crossentropy": 1.864591360092163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17119919508695602, "step": 18690 }, { "epoch": 0.37384, "grad_norm": 1.9296875, "grad_norm_var": 0.007063547770182292, "learning_rate": 0.0001, "loss": 3.7565, "loss/crossentropy": 1.6705753207206726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19292957335710526, "step": 18692 }, { "epoch": 0.37388, "grad_norm": 2.03125, "grad_norm_var": 0.004209136962890625, "learning_rate": 0.0001, "loss": 4.0448, "loss/crossentropy": 2.16109561920166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20515639334917068, "step": 18694 }, { "epoch": 0.37392, "grad_norm": 2.28125, "grad_norm_var": 0.014212799072265626, "learning_rate": 0.0001, "loss": 4.1683, "loss/crossentropy": 1.771731436252594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20304742455482483, "step": 18696 }, { "epoch": 0.37396, "grad_norm": 1.8203125, "grad_norm_var": 0.014330037434895833, "learning_rate": 0.0001, "loss": 3.9636, "loss/crossentropy": 2.046182096004486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976921409368515, "step": 18698 }, { "epoch": 0.374, "grad_norm": 1.78125, "grad_norm_var": 0.0148345947265625, "learning_rate": 0.0001, "loss": 4.0257, "loss/crossentropy": 2.0343292355537415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19037891179323196, "step": 18700 }, { "epoch": 0.37404, "grad_norm": 1.859375, "grad_norm_var": 0.014782460530598958, "learning_rate": 0.0001, "loss": 4.1727, "loss/crossentropy": 2.034367859363556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20182596892118454, "step": 18702 }, { "epoch": 0.37408, "grad_norm": 1.9609375, "grad_norm_var": 0.015616861979166667, "learning_rate": 0.0001, "loss": 3.966, "loss/crossentropy": 2.048890709877014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19922567903995514, "step": 18704 }, { "epoch": 0.37412, "grad_norm": 2.03125, "grad_norm_var": 0.0163970947265625, "learning_rate": 0.0001, "loss": 4.0864, "loss/crossentropy": 1.8759222626686096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19452380388975143, "step": 18706 }, { "epoch": 0.37416, "grad_norm": 1.9609375, "grad_norm_var": 0.015681966145833334, "learning_rate": 0.0001, "loss": 3.9964, "loss/crossentropy": 1.693844199180603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16944261640310287, "step": 18708 }, { "epoch": 0.3742, "grad_norm": 1.9765625, "grad_norm_var": 0.01727879842122396, "learning_rate": 0.0001, "loss": 4.293, "loss/crossentropy": 2.266264319419861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103852555155754, "step": 18710 }, { "epoch": 0.37424, "grad_norm": 1.9296875, "grad_norm_var": 0.008250935872395834, "learning_rate": 0.0001, "loss": 3.8547, "loss/crossentropy": 2.029367506504059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18859465420246124, "step": 18712 }, { "epoch": 0.37428, "grad_norm": 2.640625, "grad_norm_var": 0.039184315999348955, "learning_rate": 0.0001, "loss": 4.1141, "loss/crossentropy": 1.9427701234817505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960265040397644, "step": 18714 }, { "epoch": 0.37432, "grad_norm": 2.078125, "grad_norm_var": 0.03737360636393229, "learning_rate": 0.0001, "loss": 4.3087, "loss/crossentropy": 2.2656137943267822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22287357598543167, "step": 18716 }, { "epoch": 0.37436, "grad_norm": 1.9609375, "grad_norm_var": 0.0365386962890625, "learning_rate": 0.0001, "loss": 4.163, "loss/crossentropy": 2.0447250604629517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20505183935165405, "step": 18718 }, { "epoch": 0.3744, "grad_norm": 2.03125, "grad_norm_var": 0.04125137329101562, "learning_rate": 0.0001, "loss": 3.6507, "loss/crossentropy": 1.8176262378692627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17877614498138428, "step": 18720 }, { "epoch": 0.37444, "grad_norm": 1.890625, "grad_norm_var": 0.04017333984375, "learning_rate": 0.0001, "loss": 3.9368, "loss/crossentropy": 1.8362378478050232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18403278291225433, "step": 18722 }, { "epoch": 0.37448, "grad_norm": 2.140625, "grad_norm_var": 0.04104410807291667, "learning_rate": 0.0001, "loss": 4.4322, "loss/crossentropy": 2.43450927734375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21623602509498596, "step": 18724 }, { "epoch": 0.37452, "grad_norm": 2.09375, "grad_norm_var": 0.04133275349934896, "learning_rate": 0.0001, "loss": 4.3905, "loss/crossentropy": 2.227471947669983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20000503957271576, "step": 18726 }, { "epoch": 0.37456, "grad_norm": 1.8046875, "grad_norm_var": 0.04317626953125, "learning_rate": 0.0001, "loss": 3.9138, "loss/crossentropy": 1.9392182230949402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19422952830791473, "step": 18728 }, { "epoch": 0.3746, "grad_norm": 1.96875, "grad_norm_var": 0.014469401041666666, "learning_rate": 0.0001, "loss": 4.1284, "loss/crossentropy": 2.0168241262435913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1774912029504776, "step": 18730 }, { "epoch": 0.37464, "grad_norm": 1.8984375, "grad_norm_var": 0.012859853108723958, "learning_rate": 0.0001, "loss": 3.8982, "loss/crossentropy": 1.6861125230789185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16301175951957703, "step": 18732 }, { "epoch": 0.37468, "grad_norm": 2.09375, "grad_norm_var": 0.014111328125, "learning_rate": 0.0001, "loss": 4.2366, "loss/crossentropy": 2.2991716861724854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21094633638858795, "step": 18734 }, { "epoch": 0.37472, "grad_norm": 1.96875, "grad_norm_var": 0.008115386962890625, "learning_rate": 0.0001, "loss": 4.0184, "loss/crossentropy": 1.8746486902236938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19248205423355103, "step": 18736 }, { "epoch": 0.37476, "grad_norm": 2.15625, "grad_norm_var": 0.009642537434895833, "learning_rate": 0.0001, "loss": 4.1886, "loss/crossentropy": 2.137068212032318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996043175458908, "step": 18738 }, { "epoch": 0.3748, "grad_norm": 1.9296875, "grad_norm_var": 0.015636952718098958, "learning_rate": 0.0001, "loss": 4.1503, "loss/crossentropy": 1.724601149559021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23018527776002884, "step": 18740 }, { "epoch": 0.37484, "grad_norm": 1.9140625, "grad_norm_var": 0.015584309895833334, "learning_rate": 0.0001, "loss": 4.1593, "loss/crossentropy": 2.1407381296157837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21157334744930267, "step": 18742 }, { "epoch": 0.37488, "grad_norm": 1.9453125, "grad_norm_var": 0.013090006510416667, "learning_rate": 0.0001, "loss": 4.0371, "loss/crossentropy": 2.2977999448776245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19883693754673004, "step": 18744 }, { "epoch": 0.37492, "grad_norm": 2.078125, "grad_norm_var": 0.018155924479166665, "learning_rate": 0.0001, "loss": 3.8912, "loss/crossentropy": 2.096716046333313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21084215492010117, "step": 18746 }, { "epoch": 0.37496, "grad_norm": 1.9453125, "grad_norm_var": 0.019254302978515624, "learning_rate": 0.0001, "loss": 3.8508, "loss/crossentropy": 1.7292688488960266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18280881643295288, "step": 18748 }, { "epoch": 0.375, "grad_norm": 1.984375, "grad_norm_var": 0.0191314697265625, "learning_rate": 0.0001, "loss": 3.9798, "loss/crossentropy": 1.7562988996505737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889612227678299, "step": 18750 }, { "epoch": 0.37504, "grad_norm": 1.9453125, "grad_norm_var": 0.02008641560872396, "learning_rate": 0.0001, "loss": 4.1498, "loss/crossentropy": 2.1730951070785522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20177915692329407, "step": 18752 }, { "epoch": 0.37508, "grad_norm": 2.0625, "grad_norm_var": 0.018161773681640625, "learning_rate": 0.0001, "loss": 4.0316, "loss/crossentropy": 1.7671055793762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17018838971853256, "step": 18754 }, { "epoch": 0.37512, "grad_norm": 2.015625, "grad_norm_var": 0.010228474934895834, "learning_rate": 0.0001, "loss": 4.1922, "loss/crossentropy": 2.2650365829467773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22090423107147217, "step": 18756 }, { "epoch": 0.37516, "grad_norm": 1.921875, "grad_norm_var": 0.01002197265625, "learning_rate": 0.0001, "loss": 4.1985, "loss/crossentropy": 2.252619981765747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20853855460882187, "step": 18758 }, { "epoch": 0.3752, "grad_norm": 2.109375, "grad_norm_var": 0.011766560872395833, "learning_rate": 0.0001, "loss": 3.8947, "loss/crossentropy": 2.022092342376709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19293325394392014, "step": 18760 }, { "epoch": 0.37524, "grad_norm": 2.078125, "grad_norm_var": 0.010081990559895834, "learning_rate": 0.0001, "loss": 4.0289, "loss/crossentropy": 1.7500890493392944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18832595646381378, "step": 18762 }, { "epoch": 0.37528, "grad_norm": 1.8515625, "grad_norm_var": 0.009220123291015625, "learning_rate": 0.0001, "loss": 3.7769, "loss/crossentropy": 1.8201736211776733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910964399576187, "step": 18764 }, { "epoch": 0.37532, "grad_norm": 1.9375, "grad_norm_var": 0.009989420572916666, "learning_rate": 0.0001, "loss": 4.0508, "loss/crossentropy": 2.1644541025161743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20023848116397858, "step": 18766 }, { "epoch": 0.37536, "grad_norm": 2.078125, "grad_norm_var": 0.010436757405598959, "learning_rate": 0.0001, "loss": 4.163, "loss/crossentropy": 2.275505781173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23095671832561493, "step": 18768 }, { "epoch": 0.3754, "grad_norm": 2.078125, "grad_norm_var": 0.010773722330729167, "learning_rate": 0.0001, "loss": 4.0557, "loss/crossentropy": 1.976850986480713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125934213399887, "step": 18770 }, { "epoch": 0.37544, "grad_norm": 2.140625, "grad_norm_var": 0.012035115559895834, "learning_rate": 0.0001, "loss": 4.146, "loss/crossentropy": 2.150836706161499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19192685186862946, "step": 18772 }, { "epoch": 0.37548, "grad_norm": 2.046875, "grad_norm_var": 0.012400054931640625, "learning_rate": 0.0001, "loss": 4.2302, "loss/crossentropy": 2.0790088176727295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147749587893486, "step": 18774 }, { "epoch": 0.37552, "grad_norm": 1.9765625, "grad_norm_var": 0.010697428385416667, "learning_rate": 0.0001, "loss": 3.7918, "loss/crossentropy": 1.8366054892539978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19662949442863464, "step": 18776 }, { "epoch": 0.37556, "grad_norm": 2.015625, "grad_norm_var": 0.008278147379557291, "learning_rate": 0.0001, "loss": 3.9056, "loss/crossentropy": 1.6928801536560059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18292289972305298, "step": 18778 }, { "epoch": 0.3756, "grad_norm": 1.96875, "grad_norm_var": 0.007591756184895834, "learning_rate": 0.0001, "loss": 4.2872, "loss/crossentropy": 2.015128195285797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20940368622541428, "step": 18780 }, { "epoch": 0.37564, "grad_norm": 2.125, "grad_norm_var": 0.0061757405598958336, "learning_rate": 0.0001, "loss": 4.3304, "loss/crossentropy": 2.026209592819214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2227947860956192, "step": 18782 }, { "epoch": 0.37568, "grad_norm": 2.0, "grad_norm_var": 0.004349772135416667, "learning_rate": 0.0001, "loss": 4.0591, "loss/crossentropy": 2.29964280128479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23132510483264923, "step": 18784 }, { "epoch": 0.37572, "grad_norm": 1.859375, "grad_norm_var": 0.005366770426432291, "learning_rate": 0.0001, "loss": 3.9675, "loss/crossentropy": 1.881381332874298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17967566847801208, "step": 18786 }, { "epoch": 0.37576, "grad_norm": 2.046875, "grad_norm_var": 0.004198201497395833, "learning_rate": 0.0001, "loss": 4.1834, "loss/crossentropy": 2.2231001257896423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21163878589868546, "step": 18788 }, { "epoch": 0.3758, "grad_norm": 1.8984375, "grad_norm_var": 0.004526519775390625, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 2.3531078100204468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21745989471673965, "step": 18790 }, { "epoch": 0.37584, "grad_norm": 2.046875, "grad_norm_var": 0.004801432291666667, "learning_rate": 0.0001, "loss": 4.2549, "loss/crossentropy": 2.1178460121154785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23571017384529114, "step": 18792 }, { "epoch": 0.37588, "grad_norm": 1.84375, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 3.8373, "loss/crossentropy": 2.0659135580062866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20817245543003082, "step": 18794 }, { "epoch": 0.37592, "grad_norm": 1.9140625, "grad_norm_var": 0.006318918863932292, "learning_rate": 0.0001, "loss": 4.0103, "loss/crossentropy": 1.579395353794098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16713083535432816, "step": 18796 }, { "epoch": 0.37596, "grad_norm": 1.953125, "grad_norm_var": 0.005940500895182292, "learning_rate": 0.0001, "loss": 4.1272, "loss/crossentropy": 2.102541923522949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2174636349081993, "step": 18798 }, { "epoch": 0.376, "grad_norm": 2.09375, "grad_norm_var": 0.0067942301432291664, "learning_rate": 0.0001, "loss": 4.2622, "loss/crossentropy": 1.9342190027236938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19780533015727997, "step": 18800 }, { "epoch": 0.37604, "grad_norm": 1.9921875, "grad_norm_var": 0.006121571858723958, "learning_rate": 0.0001, "loss": 4.0659, "loss/crossentropy": 2.140671730041504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21276966482400894, "step": 18802 }, { "epoch": 0.37608, "grad_norm": 2.21875, "grad_norm_var": 0.009590657552083333, "learning_rate": 0.0001, "loss": 4.1929, "loss/crossentropy": 1.9004405736923218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17683426290750504, "step": 18804 }, { "epoch": 0.37612, "grad_norm": 1.859375, "grad_norm_var": 0.010155232747395833, "learning_rate": 0.0001, "loss": 3.7163, "loss/crossentropy": 1.9487649202346802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19463464617729187, "step": 18806 }, { "epoch": 0.37616, "grad_norm": 1.8828125, "grad_norm_var": 0.011262766520182292, "learning_rate": 0.0001, "loss": 3.9517, "loss/crossentropy": 2.15252423286438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20531394332647324, "step": 18808 }, { "epoch": 0.3762, "grad_norm": 1.8359375, "grad_norm_var": 0.011592356363932292, "learning_rate": 0.0001, "loss": 4.0494, "loss/crossentropy": 2.114433467388153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183662325143814, "step": 18810 }, { "epoch": 0.37624, "grad_norm": 2.0, "grad_norm_var": 0.011205037434895834, "learning_rate": 0.0001, "loss": 3.9035, "loss/crossentropy": 1.9229055047035217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18662934005260468, "step": 18812 }, { "epoch": 0.37628, "grad_norm": 2.046875, "grad_norm_var": 1.8609944661458333, "learning_rate": 0.0001, "loss": 4.2516, "loss/crossentropy": 1.8242689371109009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17858239263296127, "step": 18814 }, { "epoch": 0.37632, "grad_norm": 1.9765625, "grad_norm_var": 1.864818318684896, "learning_rate": 0.0001, "loss": 3.8917, "loss/crossentropy": 2.3987890481948853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2246004194021225, "step": 18816 }, { "epoch": 0.37636, "grad_norm": 2.234375, "grad_norm_var": 1.8565500895182292, "learning_rate": 0.0001, "loss": 4.3493, "loss/crossentropy": 2.4376614093780518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24541915208101273, "step": 18818 }, { "epoch": 0.3764, "grad_norm": 1.984375, "grad_norm_var": 1.865612538655599, "learning_rate": 0.0001, "loss": 4.1344, "loss/crossentropy": 2.2477601766586304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2178046852350235, "step": 18820 }, { "epoch": 0.37644, "grad_norm": 1.96875, "grad_norm_var": 1.8636464436848958, "learning_rate": 0.0001, "loss": 3.9685, "loss/crossentropy": 1.9782747626304626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19459142535924911, "step": 18822 }, { "epoch": 0.37648, "grad_norm": 2.0625, "grad_norm_var": 1.8451738993326823, "learning_rate": 0.0001, "loss": 4.043, "loss/crossentropy": 2.1214100122451782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2082664743065834, "step": 18824 }, { "epoch": 0.37652, "grad_norm": 1.90625, "grad_norm_var": 1.8489461263020834, "learning_rate": 0.0001, "loss": 3.8256, "loss/crossentropy": 1.68446546792984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.167055182158947, "step": 18826 }, { "epoch": 0.37656, "grad_norm": 1.9921875, "grad_norm_var": 1.8380022684733073, "learning_rate": 0.0001, "loss": 4.0588, "loss/crossentropy": 1.7720499634742737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18632127344608307, "step": 18828 }, { "epoch": 0.3766, "grad_norm": 2.140625, "grad_norm_var": 0.013492838541666666, "learning_rate": 0.0001, "loss": 4.2288, "loss/crossentropy": 2.283332347869873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23949767649173737, "step": 18830 }, { "epoch": 0.37664, "grad_norm": 1.9609375, "grad_norm_var": 0.014644114176432292, "learning_rate": 0.0001, "loss": 3.9134, "loss/crossentropy": 1.9924857020378113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20181651413440704, "step": 18832 }, { "epoch": 0.37668, "grad_norm": 2.046875, "grad_norm_var": 0.011091105143229167, "learning_rate": 0.0001, "loss": 3.8424, "loss/crossentropy": 2.4015761613845825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2279737964272499, "step": 18834 }, { "epoch": 0.37672, "grad_norm": 1.8515625, "grad_norm_var": 0.014152018229166667, "learning_rate": 0.0001, "loss": 3.8636, "loss/crossentropy": 2.0717111229896545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19089852273464203, "step": 18836 }, { "epoch": 0.37676, "grad_norm": 2.09375, "grad_norm_var": 0.0149566650390625, "learning_rate": 0.0001, "loss": 4.2694, "loss/crossentropy": 1.9993728995323181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19489262998104095, "step": 18838 }, { "epoch": 0.3768, "grad_norm": 1.8125, "grad_norm_var": 0.015949503580729166, "learning_rate": 0.0001, "loss": 3.7009, "loss/crossentropy": 1.915448248386383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1923104077577591, "step": 18840 }, { "epoch": 0.37684, "grad_norm": 1.890625, "grad_norm_var": 0.016511027018229166, "learning_rate": 0.0001, "loss": 4.0424, "loss/crossentropy": 2.292284607887268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22159253805875778, "step": 18842 }, { "epoch": 0.37688, "grad_norm": 1.9921875, "grad_norm_var": 0.012784830729166667, "learning_rate": 0.0001, "loss": 3.9904, "loss/crossentropy": 1.9938938617706299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19937817752361298, "step": 18844 }, { "epoch": 0.37692, "grad_norm": 1.9453125, "grad_norm_var": 0.016454060872395832, "learning_rate": 0.0001, "loss": 4.1914, "loss/crossentropy": 2.045258641242981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2652823179960251, "step": 18846 }, { "epoch": 0.37696, "grad_norm": 1.9921875, "grad_norm_var": 0.016707356770833334, "learning_rate": 0.0001, "loss": 4.0391, "loss/crossentropy": 2.1330259442329407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19155749678611755, "step": 18848 }, { "epoch": 0.377, "grad_norm": 1.921875, "grad_norm_var": 0.017101796468098958, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 2.2135089635849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20062117278575897, "step": 18850 }, { "epoch": 0.37704, "grad_norm": 2.046875, "grad_norm_var": 0.013826243082682292, "learning_rate": 0.0001, "loss": 4.1188, "loss/crossentropy": 2.073060691356659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19443543255329132, "step": 18852 }, { "epoch": 0.37708, "grad_norm": 1.96875, "grad_norm_var": 0.014170074462890625, "learning_rate": 0.0001, "loss": 4.089, "loss/crossentropy": 1.930641233921051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19604724645614624, "step": 18854 }, { "epoch": 0.37712, "grad_norm": 2.015625, "grad_norm_var": 0.011252593994140626, "learning_rate": 0.0001, "loss": 4.0238, "loss/crossentropy": 2.0813130140304565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21019181609153748, "step": 18856 }, { "epoch": 0.37716, "grad_norm": 2.03125, "grad_norm_var": 0.010080718994140625, "learning_rate": 0.0001, "loss": 3.9855, "loss/crossentropy": 1.8528355956077576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19945576786994934, "step": 18858 }, { "epoch": 0.3772, "grad_norm": 2.0, "grad_norm_var": 0.011896769205729166, "learning_rate": 0.0001, "loss": 3.9655, "loss/crossentropy": 2.32794725894928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22688604146242142, "step": 18860 }, { "epoch": 0.37724, "grad_norm": 2.046875, "grad_norm_var": 0.007264963785807292, "learning_rate": 0.0001, "loss": 4.25, "loss/crossentropy": 2.061101734638214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1961553767323494, "step": 18862 }, { "epoch": 0.37728, "grad_norm": 1.9609375, "grad_norm_var": 0.00604248046875, "learning_rate": 0.0001, "loss": 3.7839, "loss/crossentropy": 1.9127929210662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20082338899374008, "step": 18864 }, { "epoch": 0.37732, "grad_norm": 2.046875, "grad_norm_var": 0.14662272135416668, "learning_rate": 0.0001, "loss": 4.289, "loss/crossentropy": 2.001932919025421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881283074617386, "step": 18866 }, { "epoch": 0.37736, "grad_norm": 1.9609375, "grad_norm_var": 0.14527180989583333, "learning_rate": 0.0001, "loss": 4.0851, "loss/crossentropy": 1.693075716495514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18698371946811676, "step": 18868 }, { "epoch": 0.3774, "grad_norm": 1.984375, "grad_norm_var": 0.1481402079264323, "learning_rate": 0.0001, "loss": 4.0484, "loss/crossentropy": 1.879252314567566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18412816524505615, "step": 18870 }, { "epoch": 0.37744, "grad_norm": 2.015625, "grad_norm_var": 0.1550066630045573, "learning_rate": 0.0001, "loss": 4.3488, "loss/crossentropy": 2.1819299459457397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20243355631828308, "step": 18872 }, { "epoch": 0.37748, "grad_norm": 2.21875, "grad_norm_var": 0.1576568603515625, "learning_rate": 0.0001, "loss": 4.0889, "loss/crossentropy": 2.2487794160842896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20594948530197144, "step": 18874 }, { "epoch": 0.37752, "grad_norm": 1.8984375, "grad_norm_var": 0.15466079711914063, "learning_rate": 0.0001, "loss": 4.303, "loss/crossentropy": 1.990889549255371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19230055809020996, "step": 18876 }, { "epoch": 0.37756, "grad_norm": 1.9765625, "grad_norm_var": 0.1582763671875, "learning_rate": 0.0001, "loss": 4.0435, "loss/crossentropy": 2.091560959815979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19504332542419434, "step": 18878 }, { "epoch": 0.3776, "grad_norm": 2.109375, "grad_norm_var": 0.1599273681640625, "learning_rate": 0.0001, "loss": 4.0776, "loss/crossentropy": 1.9213955998420715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994621828198433, "step": 18880 }, { "epoch": 0.37764, "grad_norm": 1.8671875, "grad_norm_var": 0.02360814412434896, "learning_rate": 0.0001, "loss": 4.0103, "loss/crossentropy": 2.1728278398513794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21039214730262756, "step": 18882 }, { "epoch": 0.37768, "grad_norm": 1.78125, "grad_norm_var": 0.026341756184895832, "learning_rate": 0.0001, "loss": 4.052, "loss/crossentropy": 1.889222264289856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17232363671064377, "step": 18884 }, { "epoch": 0.37772, "grad_norm": 1.9921875, "grad_norm_var": 0.026228841145833334, "learning_rate": 0.0001, "loss": 4.118, "loss/crossentropy": 2.264981746673584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.213841512799263, "step": 18886 }, { "epoch": 0.37776, "grad_norm": 1.78125, "grad_norm_var": 0.013598378499348958, "learning_rate": 0.0001, "loss": 3.8843, "loss/crossentropy": 1.8615361452102661, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1745932251214981, "step": 18888 }, { "epoch": 0.3778, "grad_norm": 1.765625, "grad_norm_var": 0.009266916910807292, "learning_rate": 0.0001, "loss": 3.5687, "loss/crossentropy": 2.1459723711013794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21431762725114822, "step": 18890 }, { "epoch": 0.37784, "grad_norm": 2.0625, "grad_norm_var": 0.010479482014973958, "learning_rate": 0.0001, "loss": 4.2125, "loss/crossentropy": 1.8859283328056335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20627497136592865, "step": 18892 }, { "epoch": 0.37788, "grad_norm": 2.046875, "grad_norm_var": 0.012211100260416666, "learning_rate": 0.0001, "loss": 3.9978, "loss/crossentropy": 1.824287474155426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17877592146396637, "step": 18894 }, { "epoch": 0.37792, "grad_norm": 1.953125, "grad_norm_var": 0.012414296468098959, "learning_rate": 0.0001, "loss": 3.9857, "loss/crossentropy": 2.486180305480957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033834606409073, "step": 18896 }, { "epoch": 0.37796, "grad_norm": 2.046875, "grad_norm_var": 0.012981923421223958, "learning_rate": 0.0001, "loss": 4.2339, "loss/crossentropy": 2.0677965879440308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21111362427473068, "step": 18898 }, { "epoch": 0.378, "grad_norm": 1.9921875, "grad_norm_var": 0.011107381184895833, "learning_rate": 0.0001, "loss": 4.2559, "loss/crossentropy": 1.941792368888855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18762809038162231, "step": 18900 }, { "epoch": 0.37804, "grad_norm": 2.078125, "grad_norm_var": 0.012504069010416667, "learning_rate": 0.0001, "loss": 3.8028, "loss/crossentropy": 1.907107174396515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2050766795873642, "step": 18902 }, { "epoch": 0.37808, "grad_norm": 1.9453125, "grad_norm_var": 0.009242502848307292, "learning_rate": 0.0001, "loss": 3.9742, "loss/crossentropy": 1.5920222997665405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17366845160722733, "step": 18904 }, { "epoch": 0.37812, "grad_norm": 1.9375, "grad_norm_var": 0.0066650390625, "learning_rate": 0.0001, "loss": 3.9581, "loss/crossentropy": 2.054854154586792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20102889090776443, "step": 18906 }, { "epoch": 0.37816, "grad_norm": 1.90625, "grad_norm_var": 0.0067789713541666664, "learning_rate": 0.0001, "loss": 4.2582, "loss/crossentropy": 2.42835795879364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2154095396399498, "step": 18908 }, { "epoch": 0.3782, "grad_norm": 2.09375, "grad_norm_var": 0.007169596354166667, "learning_rate": 0.0001, "loss": 4.1466, "loss/crossentropy": 2.1060370206832886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20842333883047104, "step": 18910 }, { "epoch": 0.37824, "grad_norm": 1.96875, "grad_norm_var": 0.008194986979166667, "learning_rate": 0.0001, "loss": 4.1587, "loss/crossentropy": 2.0559674501419067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2195875570178032, "step": 18912 }, { "epoch": 0.37828, "grad_norm": 2.03125, "grad_norm_var": 0.008888498942057291, "learning_rate": 0.0001, "loss": 4.2786, "loss/crossentropy": 2.240954041481018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20908969640731812, "step": 18914 }, { "epoch": 0.37832, "grad_norm": 1.96875, "grad_norm_var": 0.008747355143229166, "learning_rate": 0.0001, "loss": 4.0729, "loss/crossentropy": 2.0393139123916626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19933026283979416, "step": 18916 }, { "epoch": 0.37836, "grad_norm": 2.109375, "grad_norm_var": 0.007682291666666666, "learning_rate": 0.0001, "loss": 4.0969, "loss/crossentropy": 2.0294516682624817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20086780190467834, "step": 18918 }, { "epoch": 0.3784, "grad_norm": 1.96875, "grad_norm_var": 0.007425944010416667, "learning_rate": 0.0001, "loss": 3.9138, "loss/crossentropy": 1.8509765267372131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18834587186574936, "step": 18920 }, { "epoch": 0.37844, "grad_norm": 1.875, "grad_norm_var": 0.0068267822265625, "learning_rate": 0.0001, "loss": 3.8669, "loss/crossentropy": 1.8817378878593445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17679119110107422, "step": 18922 }, { "epoch": 0.37848, "grad_norm": 1.9921875, "grad_norm_var": 0.005975087483723958, "learning_rate": 0.0001, "loss": 3.9276, "loss/crossentropy": 1.766166627407074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16948368400335312, "step": 18924 }, { "epoch": 0.37852, "grad_norm": 1.8671875, "grad_norm_var": 0.006786855061848959, "learning_rate": 0.0001, "loss": 3.8948, "loss/crossentropy": 2.251446485519409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22272750735282898, "step": 18926 }, { "epoch": 0.37856, "grad_norm": 1.9609375, "grad_norm_var": 0.0045125325520833336, "learning_rate": 0.0001, "loss": 4.1119, "loss/crossentropy": 2.0006097555160522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21003134548664093, "step": 18928 }, { "epoch": 0.3786, "grad_norm": 2.03125, "grad_norm_var": 0.0036936442057291666, "learning_rate": 0.0001, "loss": 3.7731, "loss/crossentropy": 2.2906641960144043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23650038242340088, "step": 18930 }, { "epoch": 0.37864, "grad_norm": 2.03125, "grad_norm_var": 0.0039866129557291664, "learning_rate": 0.0001, "loss": 4.1848, "loss/crossentropy": 2.185898005962372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21387950330972672, "step": 18932 }, { "epoch": 0.37868, "grad_norm": 1.9765625, "grad_norm_var": 0.0028928120930989585, "learning_rate": 0.0001, "loss": 4.1808, "loss/crossentropy": 2.2457324266433716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21054718643426895, "step": 18934 }, { "epoch": 0.37872, "grad_norm": 2.046875, "grad_norm_var": 0.0032793680826822915, "learning_rate": 0.0001, "loss": 4.2711, "loss/crossentropy": 2.1835416555404663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23258651047945023, "step": 18936 }, { "epoch": 0.37876, "grad_norm": 2.078125, "grad_norm_var": 0.0029436747233072915, "learning_rate": 0.0001, "loss": 4.1555, "loss/crossentropy": 2.1908507347106934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20261384546756744, "step": 18938 }, { "epoch": 0.3788, "grad_norm": 2.078125, "grad_norm_var": 0.0035336812337239585, "learning_rate": 0.0001, "loss": 4.1359, "loss/crossentropy": 2.276697278022766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20843130350112915, "step": 18940 }, { "epoch": 0.37884, "grad_norm": 1.9375, "grad_norm_var": 0.0023251851399739582, "learning_rate": 0.0001, "loss": 4.1309, "loss/crossentropy": 2.16153222322464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19693513214588165, "step": 18942 }, { "epoch": 0.37888, "grad_norm": 1.96875, "grad_norm_var": 0.0023590087890625, "learning_rate": 0.0001, "loss": 3.9822, "loss/crossentropy": 2.293270707130432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21085364371538162, "step": 18944 }, { "epoch": 0.37892, "grad_norm": 1.9609375, "grad_norm_var": 0.0025042215983072918, "learning_rate": 0.0001, "loss": 4.1291, "loss/crossentropy": 2.0382518768310547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21567383408546448, "step": 18946 }, { "epoch": 0.37896, "grad_norm": 1.8359375, "grad_norm_var": 0.00894775390625, "learning_rate": 0.0001, "loss": 3.9993, "loss/crossentropy": 2.038296341896057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19963379949331284, "step": 18948 }, { "epoch": 0.379, "grad_norm": 1.953125, "grad_norm_var": 0.00994873046875, "learning_rate": 0.0001, "loss": 4.2363, "loss/crossentropy": 2.1765074729919434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19551265239715576, "step": 18950 }, { "epoch": 0.37904, "grad_norm": 2.015625, "grad_norm_var": 0.010591379801432292, "learning_rate": 0.0001, "loss": 3.8641, "loss/crossentropy": 1.870033621788025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17858022451400757, "step": 18952 }, { "epoch": 0.37908, "grad_norm": 2.0625, "grad_norm_var": 0.011083730061848958, "learning_rate": 0.0001, "loss": 4.1877, "loss/crossentropy": 1.8716632723808289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20534101128578186, "step": 18954 }, { "epoch": 0.37912, "grad_norm": 1.8984375, "grad_norm_var": 0.0111236572265625, "learning_rate": 0.0001, "loss": 4.1662, "loss/crossentropy": 1.925970435142517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19596435129642487, "step": 18956 }, { "epoch": 0.37916, "grad_norm": 1.890625, "grad_norm_var": 0.011800130208333334, "learning_rate": 0.0001, "loss": 4.126, "loss/crossentropy": 2.2280589938163757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21037188172340393, "step": 18958 }, { "epoch": 0.3792, "grad_norm": 2.0, "grad_norm_var": 0.0150299072265625, "learning_rate": 0.0001, "loss": 3.7683, "loss/crossentropy": 1.8350458145141602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821446716785431, "step": 18960 }, { "epoch": 0.37924, "grad_norm": 1.953125, "grad_norm_var": 0.015018463134765625, "learning_rate": 0.0001, "loss": 4.0376, "loss/crossentropy": 2.0393940210342407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18378175050020218, "step": 18962 }, { "epoch": 0.37928, "grad_norm": 1.9375, "grad_norm_var": 0.00897216796875, "learning_rate": 0.0001, "loss": 4.0018, "loss/crossentropy": 1.7470228672027588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19222351908683777, "step": 18964 }, { "epoch": 0.37932, "grad_norm": 2.203125, "grad_norm_var": 0.011742146809895833, "learning_rate": 0.0001, "loss": 4.1791, "loss/crossentropy": 2.3943980932235718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23214909434318542, "step": 18966 }, { "epoch": 0.37936, "grad_norm": 1.90625, "grad_norm_var": 0.012239329020182292, "learning_rate": 0.0001, "loss": 3.7877, "loss/crossentropy": 1.8153263330459595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19808300584554672, "step": 18968 }, { "epoch": 0.3794, "grad_norm": 2.0625, "grad_norm_var": 0.011171213785807292, "learning_rate": 0.0001, "loss": 4.0122, "loss/crossentropy": 2.0037755370140076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20173701643943787, "step": 18970 }, { "epoch": 0.37944, "grad_norm": 1.8828125, "grad_norm_var": 0.012168121337890626, "learning_rate": 0.0001, "loss": 4.1069, "loss/crossentropy": 2.144823908805847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21421342343091965, "step": 18972 }, { "epoch": 0.37948, "grad_norm": 2.015625, "grad_norm_var": 0.011565907796223959, "learning_rate": 0.0001, "loss": 4.2788, "loss/crossentropy": 2.2151081562042236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20057211816310883, "step": 18974 }, { "epoch": 0.37952, "grad_norm": 2.03125, "grad_norm_var": 0.008976236979166666, "learning_rate": 0.0001, "loss": 4.5061, "loss/crossentropy": 2.341770827770233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21535535156726837, "step": 18976 }, { "epoch": 0.37956, "grad_norm": 2.109375, "grad_norm_var": 0.009520467122395833, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 1.8952747583389282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17530933022499084, "step": 18978 }, { "epoch": 0.3796, "grad_norm": 2.03125, "grad_norm_var": 0.008885701497395834, "learning_rate": 0.0001, "loss": 4.141, "loss/crossentropy": 2.2774226665496826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20682721585035324, "step": 18980 }, { "epoch": 0.37964, "grad_norm": 2.078125, "grad_norm_var": 0.008508046468098959, "learning_rate": 0.0001, "loss": 3.982, "loss/crossentropy": 1.9596800208091736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18774595111608505, "step": 18982 }, { "epoch": 0.37968, "grad_norm": 1.8359375, "grad_norm_var": 0.009309895833333333, "learning_rate": 0.0001, "loss": 4.008, "loss/crossentropy": 2.1485220193862915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19221040606498718, "step": 18984 }, { "epoch": 0.37972, "grad_norm": 1.9296875, "grad_norm_var": 0.009039052327473958, "learning_rate": 0.0001, "loss": 4.142, "loss/crossentropy": 2.2140207290649414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2012096345424652, "step": 18986 }, { "epoch": 0.37976, "grad_norm": 1.890625, "grad_norm_var": 0.008063761393229167, "learning_rate": 0.0001, "loss": 4.1088, "loss/crossentropy": 1.9045360684394836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19290773570537567, "step": 18988 }, { "epoch": 0.3798, "grad_norm": 2.03125, "grad_norm_var": 0.008072662353515624, "learning_rate": 0.0001, "loss": 4.0349, "loss/crossentropy": 1.8351407051086426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18892163783311844, "step": 18990 }, { "epoch": 0.37984, "grad_norm": 1.8984375, "grad_norm_var": 0.007865397135416667, "learning_rate": 0.0001, "loss": 3.9136, "loss/crossentropy": 2.021119713783264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19214972108602524, "step": 18992 }, { "epoch": 0.37988, "grad_norm": 2.0, "grad_norm_var": 0.006394195556640625, "learning_rate": 0.0001, "loss": 4.2961, "loss/crossentropy": 2.0591527223587036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20295391231775284, "step": 18994 }, { "epoch": 0.37992, "grad_norm": 2.0625, "grad_norm_var": 0.006566365559895833, "learning_rate": 0.0001, "loss": 4.1135, "loss/crossentropy": 1.9794987440109253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20964892208576202, "step": 18996 }, { "epoch": 0.37996, "grad_norm": 2.078125, "grad_norm_var": 0.0042111714680989586, "learning_rate": 0.0001, "loss": 4.0699, "loss/crossentropy": 2.116178512573242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962205320596695, "step": 18998 }, { "epoch": 0.38, "grad_norm": 1.8828125, "grad_norm_var": 0.003714752197265625, "learning_rate": 0.0001, "loss": 4.0683, "loss/crossentropy": 2.2458595037460327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981743946671486, "step": 19000 }, { "epoch": 0.38004, "grad_norm": 1.9140625, "grad_norm_var": 0.0038266499837239583, "learning_rate": 0.0001, "loss": 4.2085, "loss/crossentropy": 2.0612659454345703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20526322722434998, "step": 19002 }, { "epoch": 0.38008, "grad_norm": 1.9296875, "grad_norm_var": 0.00347900390625, "learning_rate": 0.0001, "loss": 4.191, "loss/crossentropy": 2.0810243487358093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.222077377140522, "step": 19004 }, { "epoch": 0.38012, "grad_norm": 1.890625, "grad_norm_var": 0.003940582275390625, "learning_rate": 0.0001, "loss": 4.0957, "loss/crossentropy": 2.2651617527008057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106148600578308, "step": 19006 }, { "epoch": 0.38016, "grad_norm": 1.8125, "grad_norm_var": 0.0056793212890625, "learning_rate": 0.0001, "loss": 3.9671, "loss/crossentropy": 1.9701108932495117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18778277933597565, "step": 19008 }, { "epoch": 0.3802, "grad_norm": 1.859375, "grad_norm_var": 0.007024892171223958, "learning_rate": 0.0001, "loss": 4.1265, "loss/crossentropy": 1.9822896122932434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25280050933361053, "step": 19010 }, { "epoch": 0.38024, "grad_norm": 2.0, "grad_norm_var": 0.0063435872395833336, "learning_rate": 0.0001, "loss": 3.9577, "loss/crossentropy": 2.020545542240143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19008295983076096, "step": 19012 }, { "epoch": 0.38028, "grad_norm": 1.9296875, "grad_norm_var": 0.007336171468098959, "learning_rate": 0.0001, "loss": 4.2044, "loss/crossentropy": 2.0922394394874573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972956582903862, "step": 19014 }, { "epoch": 0.38032, "grad_norm": 1.859375, "grad_norm_var": 0.007067616780598958, "learning_rate": 0.0001, "loss": 3.9726, "loss/crossentropy": 2.028052031993866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18549586832523346, "step": 19016 }, { "epoch": 0.38036, "grad_norm": 1.6796875, "grad_norm_var": 0.011395009358723958, "learning_rate": 0.0001, "loss": 4.0118, "loss/crossentropy": 1.9518752098083496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17747169733047485, "step": 19018 }, { "epoch": 0.3804, "grad_norm": 2.078125, "grad_norm_var": 0.013492838541666666, "learning_rate": 0.0001, "loss": 4.1404, "loss/crossentropy": 1.8896766901016235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20509368181228638, "step": 19020 }, { "epoch": 0.38044, "grad_norm": 2.046875, "grad_norm_var": 0.01407470703125, "learning_rate": 0.0001, "loss": 4.1159, "loss/crossentropy": 2.10469388961792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058955579996109, "step": 19022 }, { "epoch": 0.38048, "grad_norm": 1.875, "grad_norm_var": 0.012717437744140626, "learning_rate": 0.0001, "loss": 3.8601, "loss/crossentropy": 1.7341394424438477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19358114898204803, "step": 19024 }, { "epoch": 0.38052, "grad_norm": 1.7734375, "grad_norm_var": 0.013634999593098959, "learning_rate": 0.0001, "loss": 3.8528, "loss/crossentropy": 2.186724543571472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981809437274933, "step": 19026 }, { "epoch": 0.38056, "grad_norm": 1.8984375, "grad_norm_var": 0.014070638020833333, "learning_rate": 0.0001, "loss": 4.2421, "loss/crossentropy": 2.228869318962097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20712755620479584, "step": 19028 }, { "epoch": 0.3806, "grad_norm": 2.125, "grad_norm_var": 0.0140869140625, "learning_rate": 0.0001, "loss": 4.126, "loss/crossentropy": 2.2783373594284058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20916558057069778, "step": 19030 }, { "epoch": 0.38064, "grad_norm": 1.96875, "grad_norm_var": 0.013270823160807292, "learning_rate": 0.0001, "loss": 4.2744, "loss/crossentropy": 2.2671823501586914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19389723241329193, "step": 19032 }, { "epoch": 0.38068, "grad_norm": 1.859375, "grad_norm_var": 0.008540852864583334, "learning_rate": 0.0001, "loss": 4.0267, "loss/crossentropy": 1.948801040649414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18766260892152786, "step": 19034 }, { "epoch": 0.38072, "grad_norm": 1.90625, "grad_norm_var": 0.007818349202473958, "learning_rate": 0.0001, "loss": 4.0856, "loss/crossentropy": 1.895260751247406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848379746079445, "step": 19036 }, { "epoch": 0.38076, "grad_norm": 1.8046875, "grad_norm_var": 0.008504231770833334, "learning_rate": 0.0001, "loss": 3.8189, "loss/crossentropy": 1.8295226097106934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1816689372062683, "step": 19038 }, { "epoch": 0.3808, "grad_norm": 2.0, "grad_norm_var": 0.007830556233723958, "learning_rate": 0.0001, "loss": 4.0363, "loss/crossentropy": 1.6487661004066467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.178475059568882, "step": 19040 }, { "epoch": 0.38084, "grad_norm": 1.921875, "grad_norm_var": 0.0058095296223958336, "learning_rate": 0.0001, "loss": 3.828, "loss/crossentropy": 1.9264054894447327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009180784225464, "step": 19042 }, { "epoch": 0.38088, "grad_norm": 1.84375, "grad_norm_var": 0.007989247639973959, "learning_rate": 0.0001, "loss": 4.2636, "loss/crossentropy": 2.3018531799316406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039026990532875, "step": 19044 }, { "epoch": 0.38092, "grad_norm": 2.15625, "grad_norm_var": 0.009093983968098959, "learning_rate": 0.0001, "loss": 4.0201, "loss/crossentropy": 2.135006010532379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.172908216714859, "step": 19046 }, { "epoch": 0.38096, "grad_norm": 1.9140625, "grad_norm_var": 0.009476725260416667, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 2.1111900806427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19583145529031754, "step": 19048 }, { "epoch": 0.381, "grad_norm": 2.015625, "grad_norm_var": 0.0096099853515625, "learning_rate": 0.0001, "loss": 4.1165, "loss/crossentropy": 2.208159327507019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21107421815395355, "step": 19050 }, { "epoch": 0.38104, "grad_norm": 2.203125, "grad_norm_var": 0.012894694010416667, "learning_rate": 0.0001, "loss": 4.1722, "loss/crossentropy": 2.144998788833618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108323574066162, "step": 19052 }, { "epoch": 0.38108, "grad_norm": 2.125, "grad_norm_var": 0.0116455078125, "learning_rate": 0.0001, "loss": 4.0938, "loss/crossentropy": 2.0046940445899963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20454590767621994, "step": 19054 }, { "epoch": 0.38112, "grad_norm": 2.03125, "grad_norm_var": 0.010992177327473958, "learning_rate": 0.0001, "loss": 4.2219, "loss/crossentropy": 2.042281448841095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21530431509017944, "step": 19056 }, { "epoch": 0.38116, "grad_norm": 2.09375, "grad_norm_var": 0.0102691650390625, "learning_rate": 0.0001, "loss": 4.2701, "loss/crossentropy": 2.0579177141189575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20769091695547104, "step": 19058 }, { "epoch": 0.3812, "grad_norm": 2.3125, "grad_norm_var": 0.012719472249348959, "learning_rate": 0.0001, "loss": 4.0561, "loss/crossentropy": 1.8340198993682861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1927536353468895, "step": 19060 }, { "epoch": 0.38124, "grad_norm": 2.375, "grad_norm_var": 0.016584269205729165, "learning_rate": 0.0001, "loss": 4.0412, "loss/crossentropy": 2.146699070930481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18840950727462769, "step": 19062 }, { "epoch": 0.38128, "grad_norm": 1.90625, "grad_norm_var": 0.017600250244140626, "learning_rate": 0.0001, "loss": 3.7448, "loss/crossentropy": 2.0206886529922485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20354034006595612, "step": 19064 }, { "epoch": 0.38132, "grad_norm": 1.78125, "grad_norm_var": 0.023372141520182292, "learning_rate": 0.0001, "loss": 4.0047, "loss/crossentropy": 1.9568690061569214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.183781698346138, "step": 19066 }, { "epoch": 0.38136, "grad_norm": 2.109375, "grad_norm_var": 0.022855631510416665, "learning_rate": 0.0001, "loss": 4.0898, "loss/crossentropy": 2.092573404312134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20443417131900787, "step": 19068 }, { "epoch": 0.3814, "grad_norm": 1.84375, "grad_norm_var": 0.025770823160807293, "learning_rate": 0.0001, "loss": 4.2253, "loss/crossentropy": 2.17366099357605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064819559454918, "step": 19070 }, { "epoch": 0.38144, "grad_norm": 1.9140625, "grad_norm_var": 0.0265777587890625, "learning_rate": 0.0001, "loss": 4.0398, "loss/crossentropy": 1.7935467958450317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19256697595119476, "step": 19072 }, { "epoch": 0.38148, "grad_norm": 1.9765625, "grad_norm_var": 0.026554107666015625, "learning_rate": 0.0001, "loss": 3.9846, "loss/crossentropy": 1.995141625404358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19474036246538162, "step": 19074 }, { "epoch": 0.38152, "grad_norm": 1.8203125, "grad_norm_var": 0.023221588134765624, "learning_rate": 0.0001, "loss": 3.9058, "loss/crossentropy": 2.140002131462097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19610393047332764, "step": 19076 }, { "epoch": 0.38156, "grad_norm": 2.0, "grad_norm_var": 0.013288370768229167, "learning_rate": 0.0001, "loss": 4.1176, "loss/crossentropy": 2.1740564107894897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20218566060066223, "step": 19078 }, { "epoch": 0.3816, "grad_norm": 2.1875, "grad_norm_var": 0.015925089518229168, "learning_rate": 0.0001, "loss": 4.3043, "loss/crossentropy": 2.065530776977539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21683599054813385, "step": 19080 }, { "epoch": 0.38164, "grad_norm": 2.0625, "grad_norm_var": 0.0127349853515625, "learning_rate": 0.0001, "loss": 4.2859, "loss/crossentropy": 1.844248354434967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19479060918092728, "step": 19082 }, { "epoch": 0.38168, "grad_norm": 1.9921875, "grad_norm_var": 0.011136627197265625, "learning_rate": 0.0001, "loss": 3.886, "loss/crossentropy": 1.8596556186676025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059667930006981, "step": 19084 }, { "epoch": 0.38172, "grad_norm": 2.234375, "grad_norm_var": 0.014731597900390626, "learning_rate": 0.0001, "loss": 4.0982, "loss/crossentropy": 2.08061683177948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20623548328876495, "step": 19086 }, { "epoch": 0.38176, "grad_norm": 2.03125, "grad_norm_var": 0.014615631103515625, "learning_rate": 0.0001, "loss": 4.0013, "loss/crossentropy": 1.888766884803772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18000122904777527, "step": 19088 }, { "epoch": 0.3818, "grad_norm": 1.9140625, "grad_norm_var": 0.017032877604166666, "learning_rate": 0.0001, "loss": 3.8784, "loss/crossentropy": 2.021374225616455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21468255668878555, "step": 19090 }, { "epoch": 0.38184, "grad_norm": 2.0625, "grad_norm_var": 0.016805013020833332, "learning_rate": 0.0001, "loss": 3.8327, "loss/crossentropy": 2.083697557449341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21188172698020935, "step": 19092 }, { "epoch": 0.38188, "grad_norm": 2.171875, "grad_norm_var": 0.02019017537434896, "learning_rate": 0.0001, "loss": 3.9617, "loss/crossentropy": 2.1484656929969788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21978207677602768, "step": 19094 }, { "epoch": 0.38192, "grad_norm": 1.8671875, "grad_norm_var": 0.0218414306640625, "learning_rate": 0.0001, "loss": 4.1728, "loss/crossentropy": 2.321221709251404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22688252478837967, "step": 19096 }, { "epoch": 0.38196, "grad_norm": 2.703125, "grad_norm_var": 0.0518218994140625, "learning_rate": 0.0001, "loss": 4.157, "loss/crossentropy": 2.0623167753219604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148035168647766, "step": 19098 }, { "epoch": 0.382, "grad_norm": 2.234375, "grad_norm_var": 0.05501708984375, "learning_rate": 0.0001, "loss": 4.0255, "loss/crossentropy": 1.8845015168190002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1880025863647461, "step": 19100 }, { "epoch": 0.38204, "grad_norm": 1.9140625, "grad_norm_var": 0.05045750935872396, "learning_rate": 0.0001, "loss": 3.8361, "loss/crossentropy": 1.927463173866272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17446774244308472, "step": 19102 }, { "epoch": 0.38208, "grad_norm": 2.0, "grad_norm_var": 0.0536529541015625, "learning_rate": 0.0001, "loss": 4.2909, "loss/crossentropy": 2.6780699491500854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22951193153858185, "step": 19104 }, { "epoch": 0.38212, "grad_norm": 1.875, "grad_norm_var": 0.05358250935872396, "learning_rate": 0.0001, "loss": 3.9188, "loss/crossentropy": 2.2530760765075684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2141779437661171, "step": 19106 }, { "epoch": 0.38216, "grad_norm": 1.90625, "grad_norm_var": 0.07016499837239583, "learning_rate": 0.0001, "loss": 4.1647, "loss/crossentropy": 2.0292217135429382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20607471466064453, "step": 19108 }, { "epoch": 0.3822, "grad_norm": 2.015625, "grad_norm_var": 0.07394790649414062, "learning_rate": 0.0001, "loss": 3.8144, "loss/crossentropy": 1.8341345191001892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18437029421329498, "step": 19110 }, { "epoch": 0.38224, "grad_norm": 2.078125, "grad_norm_var": 0.06982421875, "learning_rate": 0.0001, "loss": 4.1782, "loss/crossentropy": 1.9262341260910034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894589066505432, "step": 19112 }, { "epoch": 0.38228, "grad_norm": 1.9609375, "grad_norm_var": 0.04212824503580729, "learning_rate": 0.0001, "loss": 4.2473, "loss/crossentropy": 1.9369717240333557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17737293988466263, "step": 19114 }, { "epoch": 0.38232, "grad_norm": 1.953125, "grad_norm_var": 0.03845621744791667, "learning_rate": 0.0001, "loss": 4.0904, "loss/crossentropy": 1.7547513842582703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1895698457956314, "step": 19116 }, { "epoch": 0.38236, "grad_norm": 2.15625, "grad_norm_var": 0.042789459228515625, "learning_rate": 0.0001, "loss": 4.2814, "loss/crossentropy": 2.2367511987686157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22208665311336517, "step": 19118 }, { "epoch": 0.3824, "grad_norm": 1.890625, "grad_norm_var": 0.040169016520182295, "learning_rate": 0.0001, "loss": 4.1671, "loss/crossentropy": 2.248104691505432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132478505373001, "step": 19120 }, { "epoch": 0.38244, "grad_norm": 1.7890625, "grad_norm_var": 0.040415191650390626, "learning_rate": 0.0001, "loss": 4.0371, "loss/crossentropy": 2.237170696258545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21920116990804672, "step": 19122 }, { "epoch": 0.38248, "grad_norm": 1.84375, "grad_norm_var": 0.018027496337890626, "learning_rate": 0.0001, "loss": 4.1011, "loss/crossentropy": 1.82357919216156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1723759025335312, "step": 19124 }, { "epoch": 0.38252, "grad_norm": 1.8046875, "grad_norm_var": 0.018668619791666667, "learning_rate": 0.0001, "loss": 3.8541, "loss/crossentropy": 1.9371765851974487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19782865047454834, "step": 19126 }, { "epoch": 0.38256, "grad_norm": 2.1875, "grad_norm_var": 0.0208404541015625, "learning_rate": 0.0001, "loss": 4.1305, "loss/crossentropy": 1.8295999765396118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756351813673973, "step": 19128 }, { "epoch": 0.3826, "grad_norm": 2.046875, "grad_norm_var": 0.044077301025390626, "learning_rate": 0.0001, "loss": 4.0657, "loss/crossentropy": 2.0991050601005554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17175115644931793, "step": 19130 }, { "epoch": 0.38264, "grad_norm": 2.078125, "grad_norm_var": 0.04390640258789062, "learning_rate": 0.0001, "loss": 4.1359, "loss/crossentropy": 2.2707191705703735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22085107117891312, "step": 19132 }, { "epoch": 0.38268, "grad_norm": 2.015625, "grad_norm_var": 0.03630345662434896, "learning_rate": 0.0001, "loss": 4.352, "loss/crossentropy": 2.237929582595825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2524951994419098, "step": 19134 }, { "epoch": 0.38272, "grad_norm": 1.9609375, "grad_norm_var": 0.03629735310872396, "learning_rate": 0.0001, "loss": 4.0293, "loss/crossentropy": 1.9790211915969849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18156784772872925, "step": 19136 }, { "epoch": 0.38276, "grad_norm": 1.890625, "grad_norm_var": 0.034795888264973956, "learning_rate": 0.0001, "loss": 3.9467, "loss/crossentropy": 1.9819305539131165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18941760063171387, "step": 19138 }, { "epoch": 0.3828, "grad_norm": 2.046875, "grad_norm_var": 0.032022857666015626, "learning_rate": 0.0001, "loss": 3.9699, "loss/crossentropy": 1.9541404843330383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.195252887904644, "step": 19140 }, { "epoch": 0.38284, "grad_norm": 2.0, "grad_norm_var": 0.028824869791666666, "learning_rate": 0.0001, "loss": 3.9563, "loss/crossentropy": 2.1265366673469543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20113253593444824, "step": 19142 }, { "epoch": 0.38288, "grad_norm": 1.8203125, "grad_norm_var": 0.0292724609375, "learning_rate": 0.0001, "loss": 4.0248, "loss/crossentropy": 1.767483413219452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18520043045282364, "step": 19144 }, { "epoch": 0.38292, "grad_norm": 2.0625, "grad_norm_var": 0.005352528889973959, "learning_rate": 0.0001, "loss": 4.2516, "loss/crossentropy": 2.1124974489212036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20909518748521805, "step": 19146 }, { "epoch": 0.38296, "grad_norm": 2.046875, "grad_norm_var": 0.005181630452473958, "learning_rate": 0.0001, "loss": 4.3104, "loss/crossentropy": 2.2959529161453247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2183123677968979, "step": 19148 }, { "epoch": 0.383, "grad_norm": 2.0625, "grad_norm_var": 0.005078125, "learning_rate": 0.0001, "loss": 4.3217, "loss/crossentropy": 2.3522391319274902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22062842547893524, "step": 19150 }, { "epoch": 0.38304, "grad_norm": 1.953125, "grad_norm_var": 0.0048095703125, "learning_rate": 0.0001, "loss": 3.7375, "loss/crossentropy": 1.6673399806022644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16702746599912643, "step": 19152 }, { "epoch": 0.38308, "grad_norm": 1.984375, "grad_norm_var": 0.004073079427083333, "learning_rate": 0.0001, "loss": 4.2058, "loss/crossentropy": 2.021436333656311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19833282381296158, "step": 19154 }, { "epoch": 0.38312, "grad_norm": 2.0, "grad_norm_var": 0.007439931233723958, "learning_rate": 0.0001, "loss": 3.7624, "loss/crossentropy": 1.4123128056526184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1485099121928215, "step": 19156 }, { "epoch": 0.38316, "grad_norm": 1.8125, "grad_norm_var": 0.009959920247395834, "learning_rate": 0.0001, "loss": 4.2364, "loss/crossentropy": 1.9472095966339111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16975026577711105, "step": 19158 }, { "epoch": 0.3832, "grad_norm": 2.078125, "grad_norm_var": 0.0094390869140625, "learning_rate": 0.0001, "loss": 4.2025, "loss/crossentropy": 2.112083077430725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2198200300335884, "step": 19160 }, { "epoch": 0.38324, "grad_norm": 2.015625, "grad_norm_var": 0.0121246337890625, "learning_rate": 0.0001, "loss": 3.9661, "loss/crossentropy": 2.1301331520080566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039676010608673, "step": 19162 }, { "epoch": 0.38328, "grad_norm": 2.109375, "grad_norm_var": 0.019456990559895835, "learning_rate": 0.0001, "loss": 4.0655, "loss/crossentropy": 1.8461318016052246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17811527848243713, "step": 19164 }, { "epoch": 0.38332, "grad_norm": 1.984375, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 4.0408, "loss/crossentropy": 2.102015793323517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21166063100099564, "step": 19166 }, { "epoch": 0.38336, "grad_norm": 1.8359375, "grad_norm_var": 0.020167795817057292, "learning_rate": 0.0001, "loss": 3.9572, "loss/crossentropy": 1.9496545791625977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17148377001285553, "step": 19168 }, { "epoch": 0.3834, "grad_norm": 1.890625, "grad_norm_var": 0.020643870035807293, "learning_rate": 0.0001, "loss": 4.0675, "loss/crossentropy": 2.059878885746002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984856054186821, "step": 19170 }, { "epoch": 0.38344, "grad_norm": 1.8359375, "grad_norm_var": 0.017923990885416668, "learning_rate": 0.0001, "loss": 3.6476, "loss/crossentropy": 1.7966619729995728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17823103070259094, "step": 19172 }, { "epoch": 0.38348, "grad_norm": 1.8359375, "grad_norm_var": 0.01645075480143229, "learning_rate": 0.0001, "loss": 3.6671, "loss/crossentropy": 2.0173062086105347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19681749492883682, "step": 19174 }, { "epoch": 0.38352, "grad_norm": 1.7265625, "grad_norm_var": 0.019108072916666666, "learning_rate": 0.0001, "loss": 3.6439, "loss/crossentropy": 1.6330417394638062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17260564863681793, "step": 19176 }, { "epoch": 0.38356, "grad_norm": 1.8984375, "grad_norm_var": 0.01646728515625, "learning_rate": 0.0001, "loss": 3.8957, "loss/crossentropy": 1.6575458645820618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17257437109947205, "step": 19178 }, { "epoch": 0.3836, "grad_norm": 2.0625, "grad_norm_var": 0.006266021728515625, "learning_rate": 0.0001, "loss": 3.8633, "loss/crossentropy": 2.094203770160675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039681375026703, "step": 19180 }, { "epoch": 0.38364, "grad_norm": 1.9921875, "grad_norm_var": 0.006306966145833333, "learning_rate": 0.0001, "loss": 3.9579, "loss/crossentropy": 1.9561032056808472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19183862209320068, "step": 19182 }, { "epoch": 0.38368, "grad_norm": 2.609375, "grad_norm_var": 0.03765055338541667, "learning_rate": 0.0001, "loss": 4.4288, "loss/crossentropy": 1.950503408908844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198556549847126, "step": 19184 }, { "epoch": 0.38372, "grad_norm": 1.9453125, "grad_norm_var": 0.0398193359375, "learning_rate": 0.0001, "loss": 4.2354, "loss/crossentropy": 2.421600103378296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208856999874115, "step": 19186 }, { "epoch": 0.38376, "grad_norm": 1.96875, "grad_norm_var": 0.04036356608072917, "learning_rate": 0.0001, "loss": 3.9318, "loss/crossentropy": 1.7086477279663086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19327964633703232, "step": 19188 }, { "epoch": 0.3838, "grad_norm": 2.171875, "grad_norm_var": 0.041071573893229164, "learning_rate": 0.0001, "loss": 4.1488, "loss/crossentropy": 2.1670665740966797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19333725422620773, "step": 19190 }, { "epoch": 0.38384, "grad_norm": 1.9609375, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 4.3145, "loss/crossentropy": 2.218670129776001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20835422724485397, "step": 19192 }, { "epoch": 0.38388, "grad_norm": 1.828125, "grad_norm_var": 0.037699127197265626, "learning_rate": 0.0001, "loss": 3.7753, "loss/crossentropy": 1.909380555152893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18018050491809845, "step": 19194 }, { "epoch": 0.38392, "grad_norm": 1.953125, "grad_norm_var": 0.038211822509765625, "learning_rate": 0.0001, "loss": 4.1049, "loss/crossentropy": 1.9840248227119446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19120849668979645, "step": 19196 }, { "epoch": 0.38396, "grad_norm": 2.265625, "grad_norm_var": 0.042012532552083336, "learning_rate": 0.0001, "loss": 4.4124, "loss/crossentropy": 2.3527809381484985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20009320974349976, "step": 19198 }, { "epoch": 0.384, "grad_norm": 1.96875, "grad_norm_var": 0.0181793212890625, "learning_rate": 0.0001, "loss": 4.1011, "loss/crossentropy": 2.2111377716064453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19603972136974335, "step": 19200 }, { "epoch": 0.38404, "grad_norm": 1.9375, "grad_norm_var": 0.01685358683268229, "learning_rate": 0.0001, "loss": 3.9995, "loss/crossentropy": 1.6832327842712402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18738068640232086, "step": 19202 }, { "epoch": 0.38408, "grad_norm": 2.046875, "grad_norm_var": 0.015805816650390624, "learning_rate": 0.0001, "loss": 3.9789, "loss/crossentropy": 1.9412715435028076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1835481896996498, "step": 19204 }, { "epoch": 0.38412, "grad_norm": 2.046875, "grad_norm_var": 0.013451131184895833, "learning_rate": 0.0001, "loss": 4.203, "loss/crossentropy": 2.0641706585884094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20165929198265076, "step": 19206 }, { "epoch": 0.38416, "grad_norm": 1.9765625, "grad_norm_var": 0.011400349934895833, "learning_rate": 0.0001, "loss": 4.0054, "loss/crossentropy": 2.2053582668304443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108134999871254, "step": 19208 }, { "epoch": 0.3842, "grad_norm": 1.828125, "grad_norm_var": 0.010015614827473958, "learning_rate": 0.0001, "loss": 3.8549, "loss/crossentropy": 2.0534805059432983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19088375568389893, "step": 19210 }, { "epoch": 0.38424, "grad_norm": 1.8203125, "grad_norm_var": 0.010969034830729167, "learning_rate": 0.0001, "loss": 4.06, "loss/crossentropy": 1.8782867789268494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17865915596485138, "step": 19212 }, { "epoch": 0.38428, "grad_norm": 2.078125, "grad_norm_var": 0.0059201558430989586, "learning_rate": 0.0001, "loss": 4.0641, "loss/crossentropy": 1.9773831963539124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20622816681861877, "step": 19214 }, { "epoch": 0.38432, "grad_norm": 2.078125, "grad_norm_var": 0.0067942301432291664, "learning_rate": 0.0001, "loss": 4.1479, "loss/crossentropy": 2.0890401005744934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2173013687133789, "step": 19216 }, { "epoch": 0.38436, "grad_norm": 1.90625, "grad_norm_var": 0.008676910400390625, "learning_rate": 0.0001, "loss": 3.7221, "loss/crossentropy": 1.858555793762207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15820113569498062, "step": 19218 }, { "epoch": 0.3844, "grad_norm": 2.078125, "grad_norm_var": 0.00911865234375, "learning_rate": 0.0001, "loss": 3.9924, "loss/crossentropy": 2.2149851322174072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2221473827958107, "step": 19220 }, { "epoch": 0.38444, "grad_norm": 1.8515625, "grad_norm_var": 0.009490712483723959, "learning_rate": 0.0001, "loss": 3.7057, "loss/crossentropy": 2.086311161518097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1791597157716751, "step": 19222 }, { "epoch": 0.38448, "grad_norm": 1.90625, "grad_norm_var": 0.009112294514973958, "learning_rate": 0.0001, "loss": 3.9117, "loss/crossentropy": 1.810127079486847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2018662691116333, "step": 19224 }, { "epoch": 0.38452, "grad_norm": 1.984375, "grad_norm_var": 0.009064737955729167, "learning_rate": 0.0001, "loss": 4.3156, "loss/crossentropy": 2.132863163948059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22217299789190292, "step": 19226 }, { "epoch": 0.38456, "grad_norm": 1.8828125, "grad_norm_var": 0.008299763997395833, "learning_rate": 0.0001, "loss": 3.8697, "loss/crossentropy": 2.0617652535438538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968286782503128, "step": 19228 }, { "epoch": 0.3846, "grad_norm": 1.8828125, "grad_norm_var": 0.007250722249348958, "learning_rate": 0.0001, "loss": 3.8566, "loss/crossentropy": 1.9286837577819824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20616496354341507, "step": 19230 }, { "epoch": 0.38464, "grad_norm": 2.125, "grad_norm_var": 0.008150227864583333, "learning_rate": 0.0001, "loss": 4.2854, "loss/crossentropy": 2.009112238883972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19038759917020798, "step": 19232 }, { "epoch": 0.38468, "grad_norm": 1.8984375, "grad_norm_var": 0.006818644205729167, "learning_rate": 0.0001, "loss": 4.0879, "loss/crossentropy": 2.1937737464904785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20164573192596436, "step": 19234 }, { "epoch": 0.38472, "grad_norm": 1.9296875, "grad_norm_var": 0.00587158203125, "learning_rate": 0.0001, "loss": 4.2356, "loss/crossentropy": 1.981977641582489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20445218682289124, "step": 19236 }, { "epoch": 0.38476, "grad_norm": 1.9609375, "grad_norm_var": 0.004648590087890625, "learning_rate": 0.0001, "loss": 3.9515, "loss/crossentropy": 1.6332372426986694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1629931926727295, "step": 19238 }, { "epoch": 0.3848, "grad_norm": 1.9375, "grad_norm_var": 0.003952789306640625, "learning_rate": 0.0001, "loss": 3.9061, "loss/crossentropy": 1.8656854629516602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18546659499406815, "step": 19240 }, { "epoch": 0.38484, "grad_norm": 2.09375, "grad_norm_var": 0.005594635009765625, "learning_rate": 0.0001, "loss": 4.1251, "loss/crossentropy": 2.219391703605652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21327267587184906, "step": 19242 }, { "epoch": 0.38488, "grad_norm": 2.171875, "grad_norm_var": 0.007883453369140625, "learning_rate": 0.0001, "loss": 4.448, "loss/crossentropy": 1.9518468976020813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23419301211833954, "step": 19244 }, { "epoch": 0.38492, "grad_norm": 1.9609375, "grad_norm_var": 0.007411448160807291, "learning_rate": 0.0001, "loss": 4.2947, "loss/crossentropy": 2.190311014652252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218230664730072, "step": 19246 }, { "epoch": 0.38496, "grad_norm": 1.765625, "grad_norm_var": 0.009633127848307292, "learning_rate": 0.0001, "loss": 3.8475, "loss/crossentropy": 1.8347881436347961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20036222785711288, "step": 19248 }, { "epoch": 0.385, "grad_norm": 2.0625, "grad_norm_var": 0.012157185872395834, "learning_rate": 0.0001, "loss": 3.9732, "loss/crossentropy": 1.949516236782074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20307819545269012, "step": 19250 }, { "epoch": 0.38504, "grad_norm": 1.9375, "grad_norm_var": 0.012892405192057291, "learning_rate": 0.0001, "loss": 4.007, "loss/crossentropy": 1.883664846420288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1825740560889244, "step": 19252 }, { "epoch": 0.38508, "grad_norm": 2.140625, "grad_norm_var": 0.0146881103515625, "learning_rate": 0.0001, "loss": 4.1794, "loss/crossentropy": 2.1245557069778442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20935190469026566, "step": 19254 }, { "epoch": 0.38512, "grad_norm": 2.078125, "grad_norm_var": 0.015166982014973959, "learning_rate": 0.0001, "loss": 3.9125, "loss/crossentropy": 1.546354353427887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15509265661239624, "step": 19256 }, { "epoch": 0.38516, "grad_norm": 1.96875, "grad_norm_var": 0.014290110270182291, "learning_rate": 0.0001, "loss": 4.1744, "loss/crossentropy": 1.979533076286316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20994720607995987, "step": 19258 }, { "epoch": 0.3852, "grad_norm": 1.984375, "grad_norm_var": 0.014074452718098958, "learning_rate": 0.0001, "loss": 3.7774, "loss/crossentropy": 2.125569224357605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21175500005483627, "step": 19260 }, { "epoch": 0.38524, "grad_norm": 1.859375, "grad_norm_var": 0.0124267578125, "learning_rate": 0.0001, "loss": 3.911, "loss/crossentropy": 2.027758777141571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20255477726459503, "step": 19262 }, { "epoch": 0.38528, "grad_norm": 1.84375, "grad_norm_var": 0.011058553059895834, "learning_rate": 0.0001, "loss": 3.4728, "loss/crossentropy": 1.9937690496444702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836088001728058, "step": 19264 }, { "epoch": 0.38532, "grad_norm": 1.921875, "grad_norm_var": 0.00863037109375, "learning_rate": 0.0001, "loss": 4.1797, "loss/crossentropy": 2.0846773386001587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18216369301080704, "step": 19266 }, { "epoch": 0.38536, "grad_norm": 1.984375, "grad_norm_var": 0.008429972330729167, "learning_rate": 0.0001, "loss": 3.8911, "loss/crossentropy": 1.445238471031189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17925872653722763, "step": 19268 }, { "epoch": 0.3854, "grad_norm": 1.9921875, "grad_norm_var": 0.0061757405598958336, "learning_rate": 0.0001, "loss": 4.0615, "loss/crossentropy": 2.0442845821380615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2024901956319809, "step": 19270 }, { "epoch": 0.38544, "grad_norm": 1.7265625, "grad_norm_var": 0.009144846598307292, "learning_rate": 0.0001, "loss": 4.0898, "loss/crossentropy": 1.9336092472076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17869817465543747, "step": 19272 }, { "epoch": 0.38548, "grad_norm": 2.125, "grad_norm_var": 0.010884348551432292, "learning_rate": 0.0001, "loss": 4.1939, "loss/crossentropy": 1.815299928188324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.182998888194561, "step": 19274 }, { "epoch": 0.38552, "grad_norm": 2.0, "grad_norm_var": 0.008886464436848958, "learning_rate": 0.0001, "loss": 4.2121, "loss/crossentropy": 2.091490864753723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19843900948762894, "step": 19276 }, { "epoch": 0.38556, "grad_norm": 2.03125, "grad_norm_var": 0.009608713785807292, "learning_rate": 0.0001, "loss": 4.098, "loss/crossentropy": 1.9923794865608215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186911903321743, "step": 19278 }, { "epoch": 0.3856, "grad_norm": 1.828125, "grad_norm_var": 0.009867350260416666, "learning_rate": 0.0001, "loss": 4.0658, "loss/crossentropy": 1.9323501586914062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931384950876236, "step": 19280 }, { "epoch": 0.38564, "grad_norm": 1.875, "grad_norm_var": 0.010262044270833333, "learning_rate": 0.0001, "loss": 3.8819, "loss/crossentropy": 1.8469224572181702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1835423856973648, "step": 19282 }, { "epoch": 0.38568, "grad_norm": 1.859375, "grad_norm_var": 0.010453287760416667, "learning_rate": 0.0001, "loss": 3.9027, "loss/crossentropy": 1.9476045370101929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17014781385660172, "step": 19284 }, { "epoch": 0.38572, "grad_norm": 1.9296875, "grad_norm_var": 0.010285441080729167, "learning_rate": 0.0001, "loss": 3.9943, "loss/crossentropy": 1.7913609743118286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18661632388830185, "step": 19286 }, { "epoch": 0.38576, "grad_norm": 1.8046875, "grad_norm_var": 0.00760498046875, "learning_rate": 0.0001, "loss": 4.0283, "loss/crossentropy": 1.9682837128639221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799641102552414, "step": 19288 }, { "epoch": 0.3858, "grad_norm": 1.8984375, "grad_norm_var": 0.006331125895182292, "learning_rate": 0.0001, "loss": 4.3075, "loss/crossentropy": 2.117633819580078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208334282040596, "step": 19290 }, { "epoch": 0.38584, "grad_norm": 1.890625, "grad_norm_var": 0.005761464436848958, "learning_rate": 0.0001, "loss": 4.1215, "loss/crossentropy": 2.1208746433258057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20928698778152466, "step": 19292 }, { "epoch": 0.38588, "grad_norm": 1.8046875, "grad_norm_var": 0.005549112955729167, "learning_rate": 0.0001, "loss": 4.0572, "loss/crossentropy": 1.9424505829811096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18668201565742493, "step": 19294 }, { "epoch": 0.38592, "grad_norm": 1.921875, "grad_norm_var": 0.005277252197265625, "learning_rate": 0.0001, "loss": 4.1575, "loss/crossentropy": 2.251755177974701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21153685450553894, "step": 19296 }, { "epoch": 0.38596, "grad_norm": 2.078125, "grad_norm_var": 0.007114410400390625, "learning_rate": 0.0001, "loss": 4.1885, "loss/crossentropy": 2.1629436016082764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.206063412129879, "step": 19298 }, { "epoch": 0.386, "grad_norm": 1.9765625, "grad_norm_var": 0.0069732666015625, "learning_rate": 0.0001, "loss": 4.1267, "loss/crossentropy": 2.0333832502365112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1872929260134697, "step": 19300 }, { "epoch": 0.38604, "grad_norm": 1.890625, "grad_norm_var": 0.007641347249348959, "learning_rate": 0.0001, "loss": 3.8433, "loss/crossentropy": 1.8364217281341553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17408161610364914, "step": 19302 }, { "epoch": 0.38608, "grad_norm": 1.890625, "grad_norm_var": 0.005973307291666666, "learning_rate": 0.0001, "loss": 3.9175, "loss/crossentropy": 1.975037932395935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18705998361110687, "step": 19304 }, { "epoch": 0.38612, "grad_norm": 1.7890625, "grad_norm_var": 0.006477864583333334, "learning_rate": 0.0001, "loss": 3.7868, "loss/crossentropy": 1.6889175176620483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16312924772500992, "step": 19306 }, { "epoch": 0.38616, "grad_norm": 1.9375, "grad_norm_var": 0.006392161051432292, "learning_rate": 0.0001, "loss": 4.1299, "loss/crossentropy": 2.2644211053848267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18390889465808868, "step": 19308 }, { "epoch": 0.3862, "grad_norm": 2.1875, "grad_norm_var": 0.009723917643229166, "learning_rate": 0.0001, "loss": 4.325, "loss/crossentropy": 2.180909037590027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20263531804084778, "step": 19310 }, { "epoch": 0.38624, "grad_norm": 2.015625, "grad_norm_var": 0.010138956705729167, "learning_rate": 0.0001, "loss": 4.2001, "loss/crossentropy": 2.4007444381713867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23251044005155563, "step": 19312 }, { "epoch": 0.38628, "grad_norm": 2.0, "grad_norm_var": 0.010534413655598958, "learning_rate": 0.0001, "loss": 3.9862, "loss/crossentropy": 1.8122236728668213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18613354116678238, "step": 19314 }, { "epoch": 0.38632, "grad_norm": 1.8828125, "grad_norm_var": 0.010404459635416667, "learning_rate": 0.0001, "loss": 4.1379, "loss/crossentropy": 2.159746825695038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21711497008800507, "step": 19316 }, { "epoch": 0.38636, "grad_norm": 1.953125, "grad_norm_var": 0.009627024332682291, "learning_rate": 0.0001, "loss": 3.9929, "loss/crossentropy": 2.129119336605072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19623322039842606, "step": 19318 }, { "epoch": 0.3864, "grad_norm": 1.96875, "grad_norm_var": 0.009934234619140624, "learning_rate": 0.0001, "loss": 4.1578, "loss/crossentropy": 2.322758913040161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20978064835071564, "step": 19320 }, { "epoch": 0.38644, "grad_norm": 2.015625, "grad_norm_var": 0.010322825113932291, "learning_rate": 0.0001, "loss": 4.0266, "loss/crossentropy": 2.2643767595291138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21338346600532532, "step": 19322 }, { "epoch": 0.38648, "grad_norm": 1.8828125, "grad_norm_var": 0.010731760660807292, "learning_rate": 0.0001, "loss": 4.0614, "loss/crossentropy": 1.8038535118103027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19042538851499557, "step": 19324 }, { "epoch": 0.38652, "grad_norm": 1.9609375, "grad_norm_var": 0.0069620768229166664, "learning_rate": 0.0001, "loss": 3.9641, "loss/crossentropy": 2.0847853422164917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1965423971414566, "step": 19326 }, { "epoch": 0.38656, "grad_norm": 2.015625, "grad_norm_var": 0.007692209879557292, "learning_rate": 0.0001, "loss": 4.3347, "loss/crossentropy": 2.2121591567993164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21294504404067993, "step": 19328 }, { "epoch": 0.3866, "grad_norm": 1.8671875, "grad_norm_var": 0.005985260009765625, "learning_rate": 0.0001, "loss": 3.9869, "loss/crossentropy": 2.100327968597412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20181559026241302, "step": 19330 }, { "epoch": 0.38664, "grad_norm": 1.78125, "grad_norm_var": 0.00750732421875, "learning_rate": 0.0001, "loss": 3.642, "loss/crossentropy": 1.7100898623466492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18045856803655624, "step": 19332 }, { "epoch": 0.38668, "grad_norm": 1.953125, "grad_norm_var": 0.008337148030598958, "learning_rate": 0.0001, "loss": 3.712, "loss/crossentropy": 1.6435258388519287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821775734424591, "step": 19334 }, { "epoch": 0.38672, "grad_norm": 1.8828125, "grad_norm_var": 0.008674875895182291, "learning_rate": 0.0001, "loss": 3.8334, "loss/crossentropy": 2.338008165359497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040715217590332, "step": 19336 }, { "epoch": 0.38676, "grad_norm": 1.78125, "grad_norm_var": 0.008454386393229167, "learning_rate": 0.0001, "loss": 3.8911, "loss/crossentropy": 1.8373408913612366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17516936361789703, "step": 19338 }, { "epoch": 0.3868, "grad_norm": 1.921875, "grad_norm_var": 0.008422597249348959, "learning_rate": 0.0001, "loss": 3.9902, "loss/crossentropy": 2.0346588492393494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20979416370391846, "step": 19340 }, { "epoch": 0.38684, "grad_norm": 2.078125, "grad_norm_var": 0.009399159749348959, "learning_rate": 0.0001, "loss": 4.1759, "loss/crossentropy": 2.083326816558838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20702984929084778, "step": 19342 }, { "epoch": 0.38688, "grad_norm": 2.0, "grad_norm_var": 0.012473297119140626, "learning_rate": 0.0001, "loss": 4.611, "loss/crossentropy": 2.4100319147109985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2617071568965912, "step": 19344 }, { "epoch": 0.38692, "grad_norm": 1.96875, "grad_norm_var": 0.012550608317057291, "learning_rate": 0.0001, "loss": 3.8864, "loss/crossentropy": 1.7147992849349976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.174342080950737, "step": 19346 }, { "epoch": 0.38696, "grad_norm": 2.046875, "grad_norm_var": 0.013602701822916667, "learning_rate": 0.0001, "loss": 4.1508, "loss/crossentropy": 2.041933536529541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2024158239364624, "step": 19348 }, { "epoch": 0.387, "grad_norm": 1.921875, "grad_norm_var": 0.012434895833333333, "learning_rate": 0.0001, "loss": 4.0929, "loss/crossentropy": 1.835128128528595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861354559659958, "step": 19350 }, { "epoch": 0.38704, "grad_norm": 2.15625, "grad_norm_var": 0.013370768229166666, "learning_rate": 0.0001, "loss": 4.164, "loss/crossentropy": 1.9472790360450745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19485513865947723, "step": 19352 }, { "epoch": 0.38708, "grad_norm": 1.8359375, "grad_norm_var": 0.012428538004557291, "learning_rate": 0.0001, "loss": 4.0217, "loss/crossentropy": 2.0546197295188904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1883516013622284, "step": 19354 }, { "epoch": 0.38712, "grad_norm": 1.8359375, "grad_norm_var": 0.013169097900390624, "learning_rate": 0.0001, "loss": 3.6439, "loss/crossentropy": 1.8285245299339294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1730484738945961, "step": 19356 }, { "epoch": 0.38716, "grad_norm": 1.984375, "grad_norm_var": 0.013890584309895834, "learning_rate": 0.0001, "loss": 4.2169, "loss/crossentropy": 2.320949673652649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21517502516508102, "step": 19358 }, { "epoch": 0.3872, "grad_norm": 2.03125, "grad_norm_var": 0.011289215087890625, "learning_rate": 0.0001, "loss": 4.0196, "loss/crossentropy": 2.2944518327713013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20343206077814102, "step": 19360 }, { "epoch": 0.38724, "grad_norm": 1.96875, "grad_norm_var": 0.0112945556640625, "learning_rate": 0.0001, "loss": 4.2113, "loss/crossentropy": 2.134889602661133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19354654103517532, "step": 19362 }, { "epoch": 0.38728, "grad_norm": 1.984375, "grad_norm_var": 0.009639231363932292, "learning_rate": 0.0001, "loss": 4.1244, "loss/crossentropy": 2.206334412097931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20670340210199356, "step": 19364 }, { "epoch": 0.38732, "grad_norm": 2.0, "grad_norm_var": 0.009769694010416666, "learning_rate": 0.0001, "loss": 3.8194, "loss/crossentropy": 2.1286932229995728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19814791530370712, "step": 19366 }, { "epoch": 0.38736, "grad_norm": 1.984375, "grad_norm_var": 0.0066314697265625, "learning_rate": 0.0001, "loss": 4.1533, "loss/crossentropy": 2.119162678718567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18044983595609665, "step": 19368 }, { "epoch": 0.3874, "grad_norm": 1.9453125, "grad_norm_var": 0.005060831705729167, "learning_rate": 0.0001, "loss": 4.0061, "loss/crossentropy": 2.4316102266311646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22509240359067917, "step": 19370 }, { "epoch": 0.38744, "grad_norm": 1.8046875, "grad_norm_var": 0.0060117085774739586, "learning_rate": 0.0001, "loss": 3.9984, "loss/crossentropy": 2.540325403213501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22844604402780533, "step": 19372 }, { "epoch": 0.38748, "grad_norm": 2.078125, "grad_norm_var": 0.005782063802083333, "learning_rate": 0.0001, "loss": 4.1795, "loss/crossentropy": 2.3002058267593384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21805168688297272, "step": 19374 }, { "epoch": 0.38752, "grad_norm": 2.03125, "grad_norm_var": 0.0058258056640625, "learning_rate": 0.0001, "loss": 4.274, "loss/crossentropy": 2.2381847500801086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19565512239933014, "step": 19376 }, { "epoch": 0.38756, "grad_norm": 2.046875, "grad_norm_var": 0.0060302734375, "learning_rate": 0.0001, "loss": 4.2683, "loss/crossentropy": 2.101936161518097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198813758790493, "step": 19378 }, { "epoch": 0.3876, "grad_norm": 1.8984375, "grad_norm_var": 0.0060943603515625, "learning_rate": 0.0001, "loss": 3.8679, "loss/crossentropy": 2.1088311672210693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21237140893936157, "step": 19380 }, { "epoch": 0.38764, "grad_norm": 2.0, "grad_norm_var": 0.0060943603515625, "learning_rate": 0.0001, "loss": 3.9588, "loss/crossentropy": 2.0985374450683594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18979168683290482, "step": 19382 }, { "epoch": 0.38768, "grad_norm": 1.9921875, "grad_norm_var": 0.005690256754557292, "learning_rate": 0.0001, "loss": 4.0798, "loss/crossentropy": 2.3004449605941772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053247168660164, "step": 19384 }, { "epoch": 0.38772, "grad_norm": 1.9375, "grad_norm_var": 0.005277252197265625, "learning_rate": 0.0001, "loss": 4.0285, "loss/crossentropy": 2.1257707476615906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22222986072301865, "step": 19386 }, { "epoch": 0.38776, "grad_norm": 1.9765625, "grad_norm_var": 0.003733062744140625, "learning_rate": 0.0001, "loss": 4.1336, "loss/crossentropy": 2.3034613132476807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2174137905240059, "step": 19388 }, { "epoch": 0.3878, "grad_norm": 1.7578125, "grad_norm_var": 0.006245930989583333, "learning_rate": 0.0001, "loss": 3.9854, "loss/crossentropy": 2.1303011178970337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22152414172887802, "step": 19390 }, { "epoch": 0.38784, "grad_norm": 2.0, "grad_norm_var": 0.010412343343098958, "learning_rate": 0.0001, "loss": 4.1868, "loss/crossentropy": 1.8575092554092407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18628983944654465, "step": 19392 }, { "epoch": 0.38788, "grad_norm": 2.109375, "grad_norm_var": 0.011694081624348958, "learning_rate": 0.0001, "loss": 4.1219, "loss/crossentropy": 2.0332913994789124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2499927133321762, "step": 19394 }, { "epoch": 0.38792, "grad_norm": 2.125, "grad_norm_var": 0.013004302978515625, "learning_rate": 0.0001, "loss": 4.2275, "loss/crossentropy": 2.0926660895347595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19264871627092361, "step": 19396 }, { "epoch": 0.38796, "grad_norm": 2.21875, "grad_norm_var": 0.0156005859375, "learning_rate": 0.0001, "loss": 4.2337, "loss/crossentropy": 2.08814400434494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21137948334217072, "step": 19398 }, { "epoch": 0.388, "grad_norm": 1.9921875, "grad_norm_var": 0.019162750244140624, "learning_rate": 0.0001, "loss": 4.0263, "loss/crossentropy": 2.074121594429016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20524942874908447, "step": 19400 }, { "epoch": 0.38804, "grad_norm": 1.8671875, "grad_norm_var": 0.020589192708333332, "learning_rate": 0.0001, "loss": 3.9437, "loss/crossentropy": 1.8818755745887756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1794123500585556, "step": 19402 }, { "epoch": 0.38808, "grad_norm": 1.890625, "grad_norm_var": 0.020992024739583334, "learning_rate": 0.0001, "loss": 3.8638, "loss/crossentropy": 2.5050524473190308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22337081283330917, "step": 19404 }, { "epoch": 0.38812, "grad_norm": 2.03125, "grad_norm_var": 0.0180572509765625, "learning_rate": 0.0001, "loss": 4.2158, "loss/crossentropy": 2.2035861015319824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20753300189971924, "step": 19406 }, { "epoch": 0.38816, "grad_norm": 1.90625, "grad_norm_var": 0.014012654622395834, "learning_rate": 0.0001, "loss": 3.8975, "loss/crossentropy": 2.2270091772079468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20527157932519913, "step": 19408 }, { "epoch": 0.3882, "grad_norm": 1.8828125, "grad_norm_var": 0.013504791259765624, "learning_rate": 0.0001, "loss": 3.951, "loss/crossentropy": 1.6546313762664795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17243140190839767, "step": 19410 }, { "epoch": 0.38824, "grad_norm": 1.9140625, "grad_norm_var": 0.012756093343098959, "learning_rate": 0.0001, "loss": 4.0653, "loss/crossentropy": 1.8858160376548767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18621040880680084, "step": 19412 }, { "epoch": 0.38828, "grad_norm": 2.109375, "grad_norm_var": 0.011875152587890625, "learning_rate": 0.0001, "loss": 4.4744, "loss/crossentropy": 1.9876565337181091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30138373374938965, "step": 19414 }, { "epoch": 0.38832, "grad_norm": 2.1875, "grad_norm_var": 0.013053131103515626, "learning_rate": 0.0001, "loss": 3.7802, "loss/crossentropy": 2.1887502670288086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910400092601776, "step": 19416 }, { "epoch": 0.38836, "grad_norm": 1.9453125, "grad_norm_var": 0.012511952718098959, "learning_rate": 0.0001, "loss": 3.9398, "loss/crossentropy": 2.246508002281189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2018478363752365, "step": 19418 }, { "epoch": 0.3884, "grad_norm": 1.8828125, "grad_norm_var": 0.011510976155598958, "learning_rate": 0.0001, "loss": 4.1703, "loss/crossentropy": 2.041202425956726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1890542358160019, "step": 19420 }, { "epoch": 0.38844, "grad_norm": 2.09375, "grad_norm_var": 0.016904449462890624, "learning_rate": 0.0001, "loss": 4.4154, "loss/crossentropy": 2.0470627546310425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19648576527833939, "step": 19422 }, { "epoch": 0.38848, "grad_norm": 1.9765625, "grad_norm_var": 0.0155181884765625, "learning_rate": 0.0001, "loss": 4.1794, "loss/crossentropy": 2.172736167907715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22087720036506653, "step": 19424 }, { "epoch": 0.38852, "grad_norm": 1.8828125, "grad_norm_var": 0.015811920166015625, "learning_rate": 0.0001, "loss": 4.0106, "loss/crossentropy": 2.0988662242889404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18553303182125092, "step": 19426 }, { "epoch": 0.38856, "grad_norm": 1.8359375, "grad_norm_var": 0.017134348551432293, "learning_rate": 0.0001, "loss": 4.1368, "loss/crossentropy": 1.821892261505127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19670143723487854, "step": 19428 }, { "epoch": 0.3886, "grad_norm": 1.875, "grad_norm_var": 0.016747792561848957, "learning_rate": 0.0001, "loss": 3.7878, "loss/crossentropy": 1.8893300294876099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18850861489772797, "step": 19430 }, { "epoch": 0.38864, "grad_norm": 2.984375, "grad_norm_var": 0.07888997395833333, "learning_rate": 0.0001, "loss": 4.243, "loss/crossentropy": 2.210429847240448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17908993363380432, "step": 19432 }, { "epoch": 0.38868, "grad_norm": 2.015625, "grad_norm_var": 0.07731526692708333, "learning_rate": 0.0001, "loss": 3.9939, "loss/crossentropy": 2.164771556854248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919529214501381, "step": 19434 }, { "epoch": 0.38872, "grad_norm": 1.9609375, "grad_norm_var": 0.07656631469726563, "learning_rate": 0.0001, "loss": 4.1721, "loss/crossentropy": 2.1335190534591675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2263372465968132, "step": 19436 }, { "epoch": 0.38876, "grad_norm": 1.9609375, "grad_norm_var": 0.073779296875, "learning_rate": 0.0001, "loss": 4.06, "loss/crossentropy": 2.199760317802429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20560061931610107, "step": 19438 }, { "epoch": 0.3888, "grad_norm": 1.8125, "grad_norm_var": 0.07665608723958334, "learning_rate": 0.0001, "loss": 3.9367, "loss/crossentropy": 2.2948896884918213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016046792268753, "step": 19440 }, { "epoch": 0.38884, "grad_norm": 1.921875, "grad_norm_var": 0.07588882446289062, "learning_rate": 0.0001, "loss": 4.1444, "loss/crossentropy": 2.120785415172577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18029560148715973, "step": 19442 }, { "epoch": 0.38888, "grad_norm": 1.875, "grad_norm_var": 0.07493260701497396, "learning_rate": 0.0001, "loss": 3.8872, "loss/crossentropy": 2.049036145210266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19056373089551926, "step": 19444 }, { "epoch": 0.38892, "grad_norm": 2.078125, "grad_norm_var": 0.07226130167643229, "learning_rate": 0.0001, "loss": 4.2652, "loss/crossentropy": 2.0076504945755005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20951516926288605, "step": 19446 }, { "epoch": 0.38896, "grad_norm": 1.9296875, "grad_norm_var": 0.006740061442057291, "learning_rate": 0.0001, "loss": 3.8178, "loss/crossentropy": 1.9398083090782166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21217594295740128, "step": 19448 }, { "epoch": 0.389, "grad_norm": 1.875, "grad_norm_var": 0.00546875, "learning_rate": 0.0001, "loss": 4.0245, "loss/crossentropy": 2.0976104736328125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19852425903081894, "step": 19450 }, { "epoch": 0.38904, "grad_norm": 7.625, "grad_norm_var": 2.023509724934896, "learning_rate": 0.0001, "loss": 3.9741, "loss/crossentropy": 1.8561761379241943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17980807274580002, "step": 19452 }, { "epoch": 0.38908, "grad_norm": 1.859375, "grad_norm_var": 2.0183570861816404, "learning_rate": 0.0001, "loss": 3.913, "loss/crossentropy": 2.1107255816459656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20205769687891006, "step": 19454 }, { "epoch": 0.38912, "grad_norm": 1.8203125, "grad_norm_var": 2.009368642171224, "learning_rate": 0.0001, "loss": 3.9216, "loss/crossentropy": 2.018397331237793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18721628934144974, "step": 19456 }, { "epoch": 0.38916, "grad_norm": 1.84375, "grad_norm_var": 2.010087076822917, "learning_rate": 0.0001, "loss": 3.8738, "loss/crossentropy": 2.016912341117859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20780544728040695, "step": 19458 }, { "epoch": 0.3892, "grad_norm": 1.8671875, "grad_norm_var": 2.0083984375, "learning_rate": 0.0001, "loss": 3.9523, "loss/crossentropy": 1.945086121559143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19544267654418945, "step": 19460 }, { "epoch": 0.38924, "grad_norm": 1.9375, "grad_norm_var": 2.01300048828125, "learning_rate": 0.0001, "loss": 4.1854, "loss/crossentropy": 2.059622883796692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21501117944717407, "step": 19462 }, { "epoch": 0.38928, "grad_norm": 1.96875, "grad_norm_var": 2.024466705322266, "learning_rate": 0.0001, "loss": 3.8336, "loss/crossentropy": 1.8915096521377563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1852220892906189, "step": 19464 }, { "epoch": 0.38932, "grad_norm": 2.015625, "grad_norm_var": 2.0162534077962238, "learning_rate": 0.0001, "loss": 4.0612, "loss/crossentropy": 1.5394552946090698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16873380541801453, "step": 19466 }, { "epoch": 0.38936, "grad_norm": 1.921875, "grad_norm_var": 0.03472468058268229, "learning_rate": 0.0001, "loss": 4.2426, "loss/crossentropy": 2.0290380716323853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19913609325885773, "step": 19468 }, { "epoch": 0.3894, "grad_norm": 2.03125, "grad_norm_var": 0.033841705322265624, "learning_rate": 0.0001, "loss": 4.1607, "loss/crossentropy": 2.208653211593628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.190569207072258, "step": 19470 }, { "epoch": 0.38944, "grad_norm": 1.9375, "grad_norm_var": 0.0092041015625, "learning_rate": 0.0001, "loss": 4.2646, "loss/crossentropy": 2.2873799800872803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20589639246463776, "step": 19472 }, { "epoch": 0.38948, "grad_norm": 1.953125, "grad_norm_var": 0.008014933268229166, "learning_rate": 0.0001, "loss": 4.1662, "loss/crossentropy": 2.152024030685425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20272599160671234, "step": 19474 }, { "epoch": 0.38952, "grad_norm": 1.96875, "grad_norm_var": 0.009374745686848958, "learning_rate": 0.0001, "loss": 3.9932, "loss/crossentropy": 2.0079659819602966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19110675901174545, "step": 19476 }, { "epoch": 0.38956, "grad_norm": 2.078125, "grad_norm_var": 0.010081990559895834, "learning_rate": 0.0001, "loss": 4.1061, "loss/crossentropy": 2.3006476163864136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2253967523574829, "step": 19478 }, { "epoch": 0.3896, "grad_norm": 1.8359375, "grad_norm_var": 0.010389963785807291, "learning_rate": 0.0001, "loss": 4.0678, "loss/crossentropy": 2.1429306864738464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20268720388412476, "step": 19480 }, { "epoch": 0.38964, "grad_norm": 2.03125, "grad_norm_var": 0.011417643229166666, "learning_rate": 0.0001, "loss": 4.0581, "loss/crossentropy": 2.260764956474304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121199518442154, "step": 19482 }, { "epoch": 0.38968, "grad_norm": 2.015625, "grad_norm_var": 0.010636393229166667, "learning_rate": 0.0001, "loss": 4.1607, "loss/crossentropy": 2.3874053955078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2229148969054222, "step": 19484 }, { "epoch": 0.38972, "grad_norm": 2.140625, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.1456, "loss/crossentropy": 2.2740933895111084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2509172707796097, "step": 19486 }, { "epoch": 0.38976, "grad_norm": 1.921875, "grad_norm_var": 0.009679921468098958, "learning_rate": 0.0001, "loss": 4.1831, "loss/crossentropy": 2.1949650049209595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19400373846292496, "step": 19488 }, { "epoch": 0.3898, "grad_norm": 1.96875, "grad_norm_var": 0.009549713134765625, "learning_rate": 0.0001, "loss": 3.9984, "loss/crossentropy": 1.4600969552993774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15833207219839096, "step": 19490 }, { "epoch": 0.38984, "grad_norm": 2.453125, "grad_norm_var": 0.019913482666015624, "learning_rate": 0.0001, "loss": 4.1694, "loss/crossentropy": 2.0005985498428345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20003372430801392, "step": 19492 }, { "epoch": 0.38988, "grad_norm": 1.84375, "grad_norm_var": 0.020304107666015626, "learning_rate": 0.0001, "loss": 3.8336, "loss/crossentropy": 1.9609830379486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19764738529920578, "step": 19494 }, { "epoch": 0.38992, "grad_norm": 1.8984375, "grad_norm_var": 0.01943359375, "learning_rate": 0.0001, "loss": 3.8708, "loss/crossentropy": 1.7853738069534302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18979863077402115, "step": 19496 }, { "epoch": 0.38996, "grad_norm": 1.953125, "grad_norm_var": 0.018961588541666668, "learning_rate": 0.0001, "loss": 4.0335, "loss/crossentropy": 2.004499912261963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20765355974435806, "step": 19498 }, { "epoch": 0.39, "grad_norm": 2.015625, "grad_norm_var": 0.019870758056640625, "learning_rate": 0.0001, "loss": 4.1277, "loss/crossentropy": 2.0148558020591736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18088079243898392, "step": 19500 }, { "epoch": 0.39004, "grad_norm": 2.140625, "grad_norm_var": 0.022965240478515624, "learning_rate": 0.0001, "loss": 4.1213, "loss/crossentropy": 2.2602895498275757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22686412185430527, "step": 19502 }, { "epoch": 0.39008, "grad_norm": 1.9921875, "grad_norm_var": 0.024326324462890625, "learning_rate": 0.0001, "loss": 4.0541, "loss/crossentropy": 2.1789051294326782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1989632546901703, "step": 19504 }, { "epoch": 0.39012, "grad_norm": 2.03125, "grad_norm_var": 0.024568684895833335, "learning_rate": 0.0001, "loss": 3.9234, "loss/crossentropy": 1.93650484085083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976509690284729, "step": 19506 }, { "epoch": 0.39016, "grad_norm": 1.96875, "grad_norm_var": 0.008351389567057292, "learning_rate": 0.0001, "loss": 3.9519, "loss/crossentropy": 1.9904287457466125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17271699011325836, "step": 19508 }, { "epoch": 0.3902, "grad_norm": 2.03125, "grad_norm_var": 0.008194986979166667, "learning_rate": 0.0001, "loss": 3.7009, "loss/crossentropy": 1.4440776705741882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16108233481645584, "step": 19510 }, { "epoch": 0.39024, "grad_norm": 1.984375, "grad_norm_var": 0.008111317952473959, "learning_rate": 0.0001, "loss": 4.0218, "loss/crossentropy": 2.143462061882019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2024468332529068, "step": 19512 }, { "epoch": 0.39028, "grad_norm": 2.03125, "grad_norm_var": 0.008436838785807291, "learning_rate": 0.0001, "loss": 4.1267, "loss/crossentropy": 2.085771322250366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22135765105485916, "step": 19514 }, { "epoch": 0.39032, "grad_norm": 1.9296875, "grad_norm_var": 0.008337148030598958, "learning_rate": 0.0001, "loss": 4.1423, "loss/crossentropy": 2.251617908477783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026226669549942, "step": 19516 }, { "epoch": 0.39036, "grad_norm": 2.03125, "grad_norm_var": 0.003885650634765625, "learning_rate": 0.0001, "loss": 4.3156, "loss/crossentropy": 2.3808701038360596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2263621687889099, "step": 19518 }, { "epoch": 0.3904, "grad_norm": 1.8203125, "grad_norm_var": 0.004019927978515625, "learning_rate": 0.0001, "loss": 4.0323, "loss/crossentropy": 2.000342011451721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20008830726146698, "step": 19520 }, { "epoch": 0.39044, "grad_norm": 2.09375, "grad_norm_var": 0.0044830322265625, "learning_rate": 0.0001, "loss": 4.3285, "loss/crossentropy": 2.242555856704712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2443876415491104, "step": 19522 }, { "epoch": 0.39048, "grad_norm": 2.0625, "grad_norm_var": 0.005060831705729167, "learning_rate": 0.0001, "loss": 4.344, "loss/crossentropy": 2.2867462635040283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21522662043571472, "step": 19524 }, { "epoch": 0.39052, "grad_norm": 1.9453125, "grad_norm_var": 0.0053955078125, "learning_rate": 0.0001, "loss": 4.0117, "loss/crossentropy": 2.1892699003219604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.205935537815094, "step": 19526 }, { "epoch": 0.39056, "grad_norm": 1.890625, "grad_norm_var": 0.007972971598307291, "learning_rate": 0.0001, "loss": 3.8673, "loss/crossentropy": 2.182734966278076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980958804488182, "step": 19528 }, { "epoch": 0.3906, "grad_norm": 1.96875, "grad_norm_var": 0.0075927734375, "learning_rate": 0.0001, "loss": 4.0193, "loss/crossentropy": 2.3624355792999268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22833774983882904, "step": 19530 }, { "epoch": 0.39064, "grad_norm": 1.875, "grad_norm_var": 0.007671864827473959, "learning_rate": 0.0001, "loss": 4.0497, "loss/crossentropy": 2.037912607192993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2032359093427658, "step": 19532 }, { "epoch": 0.39068, "grad_norm": 2.015625, "grad_norm_var": 0.007513173421223958, "learning_rate": 0.0001, "loss": 3.9812, "loss/crossentropy": 2.076040804386139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21829531341791153, "step": 19534 }, { "epoch": 0.39072, "grad_norm": 1.9453125, "grad_norm_var": 0.0058502197265625, "learning_rate": 0.0001, "loss": 4.0407, "loss/crossentropy": 2.2695836424827576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125643789768219, "step": 19536 }, { "epoch": 0.39076, "grad_norm": 2.078125, "grad_norm_var": 0.0061279296875, "learning_rate": 0.0001, "loss": 4.2664, "loss/crossentropy": 1.9343088269233704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045460194349289, "step": 19538 }, { "epoch": 0.3908, "grad_norm": 1.9609375, "grad_norm_var": 0.005272420247395834, "learning_rate": 0.0001, "loss": 4.0757, "loss/crossentropy": 1.9968918561935425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17788998782634735, "step": 19540 }, { "epoch": 0.39084, "grad_norm": 1.9609375, "grad_norm_var": 0.005467732747395833, "learning_rate": 0.0001, "loss": 4.094, "loss/crossentropy": 2.157910704612732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19516880810260773, "step": 19542 }, { "epoch": 0.39088, "grad_norm": 2.0, "grad_norm_var": 0.004042307535807292, "learning_rate": 0.0001, "loss": 4.1056, "loss/crossentropy": 2.1784998178482056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20280858874320984, "step": 19544 }, { "epoch": 0.39092, "grad_norm": 2.03125, "grad_norm_var": 0.00438232421875, "learning_rate": 0.0001, "loss": 4.0796, "loss/crossentropy": 2.3444844484329224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2418832629919052, "step": 19546 }, { "epoch": 0.39096, "grad_norm": 1.96875, "grad_norm_var": 0.0038083394368489585, "learning_rate": 0.0001, "loss": 3.9802, "loss/crossentropy": 1.90557062625885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010379433631897, "step": 19548 }, { "epoch": 0.391, "grad_norm": 1.90625, "grad_norm_var": 0.004107411702473958, "learning_rate": 0.0001, "loss": 3.7756, "loss/crossentropy": 1.4939787983894348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1362278200685978, "step": 19550 }, { "epoch": 0.39104, "grad_norm": 1.9140625, "grad_norm_var": 0.0063517252604166664, "learning_rate": 0.0001, "loss": 4.1738, "loss/crossentropy": 2.037553310394287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159510999917984, "step": 19552 }, { "epoch": 0.39108, "grad_norm": 1.9375, "grad_norm_var": 0.004816691080729167, "learning_rate": 0.0001, "loss": 3.887, "loss/crossentropy": 1.9987242221832275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19919036328792572, "step": 19554 }, { "epoch": 0.39112, "grad_norm": 1.90625, "grad_norm_var": 0.0048906962076822914, "learning_rate": 0.0001, "loss": 4.0298, "loss/crossentropy": 2.1593196392059326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21370293200016022, "step": 19556 }, { "epoch": 0.39116, "grad_norm": 1.9375, "grad_norm_var": 0.005631510416666667, "learning_rate": 0.0001, "loss": 3.8348, "loss/crossentropy": 1.8323914408683777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20382094383239746, "step": 19558 }, { "epoch": 0.3912, "grad_norm": 1.9375, "grad_norm_var": 0.0053059895833333336, "learning_rate": 0.0001, "loss": 4.1138, "loss/crossentropy": 2.0240999460220337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19374582171440125, "step": 19560 }, { "epoch": 0.39124, "grad_norm": 2.0, "grad_norm_var": 0.005304972330729167, "learning_rate": 0.0001, "loss": 4.4349, "loss/crossentropy": 2.377955436706543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2213267982006073, "step": 19562 }, { "epoch": 0.39128, "grad_norm": 2.015625, "grad_norm_var": 0.006154123942057292, "learning_rate": 0.0001, "loss": 4.0878, "loss/crossentropy": 1.724283754825592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20119256526231766, "step": 19564 }, { "epoch": 0.39132, "grad_norm": 1.953125, "grad_norm_var": 0.005655670166015625, "learning_rate": 0.0001, "loss": 3.9066, "loss/crossentropy": 2.0352718234062195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19105417281389236, "step": 19566 }, { "epoch": 0.39136, "grad_norm": 1.875, "grad_norm_var": 0.0037913004557291667, "learning_rate": 0.0001, "loss": 3.7868, "loss/crossentropy": 2.078265905380249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19992397725582123, "step": 19568 }, { "epoch": 0.3914, "grad_norm": 1.859375, "grad_norm_var": 0.004329172770182291, "learning_rate": 0.0001, "loss": 3.8276, "loss/crossentropy": 1.9722577929496765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17697854340076447, "step": 19570 }, { "epoch": 0.39144, "grad_norm": 1.9453125, "grad_norm_var": 0.004915364583333333, "learning_rate": 0.0001, "loss": 4.0587, "loss/crossentropy": 2.2238826751708984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20041973888874054, "step": 19572 }, { "epoch": 0.39148, "grad_norm": 1.9140625, "grad_norm_var": 0.0037923177083333333, "learning_rate": 0.0001, "loss": 4.0226, "loss/crossentropy": 1.8374757170677185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18086174875497818, "step": 19574 }, { "epoch": 0.39152, "grad_norm": 2.140625, "grad_norm_var": 0.00635986328125, "learning_rate": 0.0001, "loss": 4.2903, "loss/crossentropy": 2.0642590522766113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19544780254364014, "step": 19576 }, { "epoch": 0.39156, "grad_norm": 2.421875, "grad_norm_var": 0.020493316650390624, "learning_rate": 0.0001, "loss": 4.1231, "loss/crossentropy": 2.090702533721924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19859656691551208, "step": 19578 }, { "epoch": 0.3916, "grad_norm": 2.1875, "grad_norm_var": 0.023538970947265626, "learning_rate": 0.0001, "loss": 3.9075, "loss/crossentropy": 1.8087154030799866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918642893433571, "step": 19580 }, { "epoch": 0.39164, "grad_norm": 1.9140625, "grad_norm_var": 0.023527018229166665, "learning_rate": 0.0001, "loss": 3.8869, "loss/crossentropy": 2.0180618166923523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19190336763858795, "step": 19582 }, { "epoch": 0.39168, "grad_norm": 2.171875, "grad_norm_var": 0.025886027018229167, "learning_rate": 0.0001, "loss": 4.2298, "loss/crossentropy": 1.980249285697937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20591172575950623, "step": 19584 }, { "epoch": 0.39172, "grad_norm": 1.8359375, "grad_norm_var": 0.026374308268229167, "learning_rate": 0.0001, "loss": 3.9883, "loss/crossentropy": 2.1208410263061523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20714347064495087, "step": 19586 }, { "epoch": 0.39176, "grad_norm": 2.046875, "grad_norm_var": 0.024179840087890626, "learning_rate": 0.0001, "loss": 4.3101, "loss/crossentropy": 2.351140856742859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26494763791561127, "step": 19588 }, { "epoch": 0.3918, "grad_norm": 2.125, "grad_norm_var": 0.024448394775390625, "learning_rate": 0.0001, "loss": 4.1259, "loss/crossentropy": 1.9821222424507141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062402218580246, "step": 19590 }, { "epoch": 0.39184, "grad_norm": 2.015625, "grad_norm_var": 0.02462158203125, "learning_rate": 0.0001, "loss": 4.1373, "loss/crossentropy": 2.363589644432068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22435829043388367, "step": 19592 }, { "epoch": 0.39188, "grad_norm": 1.984375, "grad_norm_var": 0.013963826497395833, "learning_rate": 0.0001, "loss": 4.1845, "loss/crossentropy": 1.925455391407013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17338209599256516, "step": 19594 }, { "epoch": 0.39192, "grad_norm": 2.109375, "grad_norm_var": 0.012254842122395833, "learning_rate": 0.0001, "loss": 3.9777, "loss/crossentropy": 2.176175117492676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21482165157794952, "step": 19596 }, { "epoch": 0.39196, "grad_norm": 1.9609375, "grad_norm_var": 0.014330037434895833, "learning_rate": 0.0001, "loss": 3.9266, "loss/crossentropy": 2.0013960003852844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20629072189331055, "step": 19598 }, { "epoch": 0.392, "grad_norm": 1.859375, "grad_norm_var": 0.011797841389973958, "learning_rate": 0.0001, "loss": 3.9402, "loss/crossentropy": 2.066355049610138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22155777364969254, "step": 19600 }, { "epoch": 0.39204, "grad_norm": 1.8359375, "grad_norm_var": 0.010809071858723958, "learning_rate": 0.0001, "loss": 4.0319, "loss/crossentropy": 2.0513144731521606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19442218542099, "step": 19602 }, { "epoch": 0.39208, "grad_norm": 2.046875, "grad_norm_var": 0.010896809895833333, "learning_rate": 0.0001, "loss": 4.2601, "loss/crossentropy": 1.9635959267616272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20168906450271606, "step": 19604 }, { "epoch": 0.39212, "grad_norm": 1.890625, "grad_norm_var": 0.009025065104166667, "learning_rate": 0.0001, "loss": 3.9406, "loss/crossentropy": 1.9395795464515686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17990338802337646, "step": 19606 }, { "epoch": 0.39216, "grad_norm": 2.125, "grad_norm_var": 0.011004384358723958, "learning_rate": 0.0001, "loss": 4.1936, "loss/crossentropy": 2.111960232257843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20662778615951538, "step": 19608 }, { "epoch": 0.3922, "grad_norm": 1.8125, "grad_norm_var": 0.011466471354166667, "learning_rate": 0.0001, "loss": 3.9238, "loss/crossentropy": 1.855182707309723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19665290415287018, "step": 19610 }, { "epoch": 0.39224, "grad_norm": 2.0, "grad_norm_var": 0.0156494140625, "learning_rate": 0.0001, "loss": 4.262, "loss/crossentropy": 2.1587076783180237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20080996304750443, "step": 19612 }, { "epoch": 0.39228, "grad_norm": 2.015625, "grad_norm_var": 0.01597874959309896, "learning_rate": 0.0001, "loss": 4.1227, "loss/crossentropy": 2.130104422569275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19484283030033112, "step": 19614 }, { "epoch": 0.39232, "grad_norm": 1.75, "grad_norm_var": 0.017992146809895835, "learning_rate": 0.0001, "loss": 3.9046, "loss/crossentropy": 1.6983461380004883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18040503561496735, "step": 19616 }, { "epoch": 0.39236, "grad_norm": 1.953125, "grad_norm_var": 0.017756144205729168, "learning_rate": 0.0001, "loss": 4.0141, "loss/crossentropy": 2.0672999024391174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20455461740493774, "step": 19618 }, { "epoch": 0.3924, "grad_norm": 1.84375, "grad_norm_var": 0.018202463785807293, "learning_rate": 0.0001, "loss": 3.83, "loss/crossentropy": 2.0494508743286133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18307264149188995, "step": 19620 }, { "epoch": 0.39244, "grad_norm": 1.875, "grad_norm_var": 0.018277740478515624, "learning_rate": 0.0001, "loss": 3.9901, "loss/crossentropy": 1.683717966079712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16951359808444977, "step": 19622 }, { "epoch": 0.39248, "grad_norm": 1.859375, "grad_norm_var": 0.015197499593098959, "learning_rate": 0.0001, "loss": 3.8383, "loss/crossentropy": 2.62760066986084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22649705410003662, "step": 19624 }, { "epoch": 0.39252, "grad_norm": 2.296875, "grad_norm_var": 0.024072011311848957, "learning_rate": 0.0001, "loss": 4.19, "loss/crossentropy": 1.857836663722992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827384978532791, "step": 19626 }, { "epoch": 0.39256, "grad_norm": 2.0, "grad_norm_var": 0.016108957926432292, "learning_rate": 0.0001, "loss": 3.7635, "loss/crossentropy": 2.1203905940055847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20033711194992065, "step": 19628 }, { "epoch": 0.3926, "grad_norm": 2.125, "grad_norm_var": 0.01721165974934896, "learning_rate": 0.0001, "loss": 4.4474, "loss/crossentropy": 2.378189444541931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20892338454723358, "step": 19630 }, { "epoch": 0.39264, "grad_norm": 1.9140625, "grad_norm_var": 0.015819295247395834, "learning_rate": 0.0001, "loss": 4.0897, "loss/crossentropy": 1.896558940410614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18873175233602524, "step": 19632 }, { "epoch": 0.39268, "grad_norm": 2.109375, "grad_norm_var": 0.020798492431640624, "learning_rate": 0.0001, "loss": 4.3611, "loss/crossentropy": 2.3503568172454834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23128806054592133, "step": 19634 }, { "epoch": 0.39272, "grad_norm": 1.7578125, "grad_norm_var": 0.021345011393229165, "learning_rate": 0.0001, "loss": 4.0515, "loss/crossentropy": 2.081954002380371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20059917122125626, "step": 19636 }, { "epoch": 0.39276, "grad_norm": 2.015625, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 4.0986, "loss/crossentropy": 1.9573910236358643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903771460056305, "step": 19638 }, { "epoch": 0.3928, "grad_norm": 1.859375, "grad_norm_var": 0.022025299072265626, "learning_rate": 0.0001, "loss": 3.9333, "loss/crossentropy": 2.098384141921997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19905343651771545, "step": 19640 }, { "epoch": 0.39284, "grad_norm": 2.0, "grad_norm_var": 0.015892537434895833, "learning_rate": 0.0001, "loss": 4.0854, "loss/crossentropy": 1.8510947227478027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18955568969249725, "step": 19642 }, { "epoch": 0.39288, "grad_norm": 1.9453125, "grad_norm_var": 0.016440582275390626, "learning_rate": 0.0001, "loss": 4.2948, "loss/crossentropy": 2.2839618921279907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23230211436748505, "step": 19644 }, { "epoch": 0.39292, "grad_norm": 1.953125, "grad_norm_var": 0.014778391520182291, "learning_rate": 0.0001, "loss": 4.0112, "loss/crossentropy": 1.8055492639541626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20515839755535126, "step": 19646 }, { "epoch": 0.39296, "grad_norm": 1.9765625, "grad_norm_var": 0.013348134358723958, "learning_rate": 0.0001, "loss": 4.0556, "loss/crossentropy": 1.6756115555763245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17216521501541138, "step": 19648 }, { "epoch": 0.393, "grad_norm": 1.984375, "grad_norm_var": 0.007287343343098958, "learning_rate": 0.0001, "loss": 4.4001, "loss/crossentropy": 2.3247755765914917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21122244000434875, "step": 19650 }, { "epoch": 0.39304, "grad_norm": 1.9296875, "grad_norm_var": 0.004044596354166667, "learning_rate": 0.0001, "loss": 3.895, "loss/crossentropy": 1.9654970169067383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20018797367811203, "step": 19652 }, { "epoch": 0.39308, "grad_norm": 1.96875, "grad_norm_var": 0.0029042561848958335, "learning_rate": 0.0001, "loss": 3.9018, "loss/crossentropy": 1.9368168115615845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19478464126586914, "step": 19654 }, { "epoch": 0.39312, "grad_norm": 2.03125, "grad_norm_var": 0.0023671468098958332, "learning_rate": 0.0001, "loss": 3.7869, "loss/crossentropy": 2.099206328392029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20060646533966064, "step": 19656 }, { "epoch": 0.39316, "grad_norm": 1.890625, "grad_norm_var": 0.002561187744140625, "learning_rate": 0.0001, "loss": 4.3438, "loss/crossentropy": 2.1585338711738586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20852376520633698, "step": 19658 }, { "epoch": 0.3932, "grad_norm": 1.796875, "grad_norm_var": 0.0044830322265625, "learning_rate": 0.0001, "loss": 4.0533, "loss/crossentropy": 2.285220742225647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983182057738304, "step": 19660 }, { "epoch": 0.39324, "grad_norm": 2.03125, "grad_norm_var": 0.0047190348307291664, "learning_rate": 0.0001, "loss": 4.0046, "loss/crossentropy": 1.955579936504364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21165720373392105, "step": 19662 }, { "epoch": 0.39328, "grad_norm": 1.8046875, "grad_norm_var": 0.00645751953125, "learning_rate": 0.0001, "loss": 3.823, "loss/crossentropy": 1.9925233721733093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18789846450090408, "step": 19664 }, { "epoch": 0.39332, "grad_norm": 1.984375, "grad_norm_var": 0.006490071614583333, "learning_rate": 0.0001, "loss": 4.0123, "loss/crossentropy": 1.9267281293869019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19545376300811768, "step": 19666 }, { "epoch": 0.39336, "grad_norm": 1.9453125, "grad_norm_var": 0.006493123372395834, "learning_rate": 0.0001, "loss": 4.0401, "loss/crossentropy": 2.179477632045746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19430068880319595, "step": 19668 }, { "epoch": 0.3934, "grad_norm": 2.046875, "grad_norm_var": 0.007228342692057291, "learning_rate": 0.0001, "loss": 4.1508, "loss/crossentropy": 2.1411179900169373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2276124805212021, "step": 19670 }, { "epoch": 0.39344, "grad_norm": 2.3125, "grad_norm_var": 0.014438629150390625, "learning_rate": 0.0001, "loss": 4.2689, "loss/crossentropy": 2.085427463054657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21856296062469482, "step": 19672 }, { "epoch": 0.39348, "grad_norm": 2.09375, "grad_norm_var": 0.014495595296223959, "learning_rate": 0.0001, "loss": 4.0302, "loss/crossentropy": 1.7399682402610779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16918571293354034, "step": 19674 }, { "epoch": 0.39352, "grad_norm": 1.9296875, "grad_norm_var": 0.011356608072916666, "learning_rate": 0.0001, "loss": 4.0404, "loss/crossentropy": 1.8636209964752197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17654836922883987, "step": 19676 }, { "epoch": 0.39356, "grad_norm": 1.7421875, "grad_norm_var": 0.015290323893229167, "learning_rate": 0.0001, "loss": 4.0157, "loss/crossentropy": 2.143825590610504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125091552734375, "step": 19678 }, { "epoch": 0.3936, "grad_norm": 1.6953125, "grad_norm_var": 0.01873753865559896, "learning_rate": 0.0001, "loss": 3.7692, "loss/crossentropy": 1.8222747445106506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17713025212287903, "step": 19680 }, { "epoch": 0.39364, "grad_norm": 1.8515625, "grad_norm_var": 0.01969172159830729, "learning_rate": 0.0001, "loss": 3.8254, "loss/crossentropy": 1.7044820189476013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17669418454170227, "step": 19682 }, { "epoch": 0.39368, "grad_norm": 1.953125, "grad_norm_var": 0.020411936442057292, "learning_rate": 0.0001, "loss": 4.0987, "loss/crossentropy": 2.144485831260681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20580272376537323, "step": 19684 }, { "epoch": 0.39372, "grad_norm": 1.984375, "grad_norm_var": 0.019774373372395834, "learning_rate": 0.0001, "loss": 4.0994, "loss/crossentropy": 2.17536997795105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21704821288585663, "step": 19686 }, { "epoch": 0.39376, "grad_norm": 1.9453125, "grad_norm_var": 0.012963612874348959, "learning_rate": 0.0001, "loss": 4.2592, "loss/crossentropy": 2.1875303983688354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996757537126541, "step": 19688 }, { "epoch": 0.3938, "grad_norm": 2.125, "grad_norm_var": 0.014121246337890626, "learning_rate": 0.0001, "loss": 4.1943, "loss/crossentropy": 2.270558476448059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23406681418418884, "step": 19690 }, { "epoch": 0.39384, "grad_norm": 1.8671875, "grad_norm_var": 0.015860748291015626, "learning_rate": 0.0001, "loss": 4.2097, "loss/crossentropy": 2.0614060163497925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21256985515356064, "step": 19692 }, { "epoch": 0.39388, "grad_norm": 1.9765625, "grad_norm_var": 0.015091705322265624, "learning_rate": 0.0001, "loss": 4.042, "loss/crossentropy": 2.1036725640296936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19555094093084335, "step": 19694 }, { "epoch": 0.39392, "grad_norm": 1.8828125, "grad_norm_var": 0.010396321614583334, "learning_rate": 0.0001, "loss": 3.7702, "loss/crossentropy": 1.8713775277137756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973343789577484, "step": 19696 }, { "epoch": 0.39396, "grad_norm": 1.921875, "grad_norm_var": 0.009411366780598958, "learning_rate": 0.0001, "loss": 3.9581, "loss/crossentropy": 1.9986143708229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1925104334950447, "step": 19698 }, { "epoch": 0.394, "grad_norm": 1.9921875, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 3.8134, "loss/crossentropy": 1.8336694836616516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18453969806432724, "step": 19700 }, { "epoch": 0.39404, "grad_norm": 1.9375, "grad_norm_var": 0.01065673828125, "learning_rate": 0.0001, "loss": 3.9279, "loss/crossentropy": 1.8948233723640442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17115242034196854, "step": 19702 }, { "epoch": 0.39408, "grad_norm": 1.984375, "grad_norm_var": 0.008958943684895833, "learning_rate": 0.0001, "loss": 4.244, "loss/crossentropy": 2.5506935119628906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22056522965431213, "step": 19704 }, { "epoch": 0.39412, "grad_norm": 1.984375, "grad_norm_var": 0.02890625, "learning_rate": 0.0001, "loss": 4.1832, "loss/crossentropy": 2.127421021461487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20175430178642273, "step": 19706 }, { "epoch": 0.39416, "grad_norm": 1.9765625, "grad_norm_var": 0.027197265625, "learning_rate": 0.0001, "loss": 4.0251, "loss/crossentropy": 2.246693968772888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22178302705287933, "step": 19708 }, { "epoch": 0.3942, "grad_norm": 1.6953125, "grad_norm_var": 0.03328221638997396, "learning_rate": 0.0001, "loss": 3.9336, "loss/crossentropy": 1.970005750656128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18967190384864807, "step": 19710 }, { "epoch": 0.39424, "grad_norm": 1.921875, "grad_norm_var": 0.032956695556640624, "learning_rate": 0.0001, "loss": 4.1035, "loss/crossentropy": 1.944337785243988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952769234776497, "step": 19712 }, { "epoch": 0.39428, "grad_norm": 1.9453125, "grad_norm_var": 0.0328277587890625, "learning_rate": 0.0001, "loss": 4.2289, "loss/crossentropy": 2.291213870048523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077646553516388, "step": 19714 }, { "epoch": 0.39432, "grad_norm": 2.015625, "grad_norm_var": 0.03361790974934896, "learning_rate": 0.0001, "loss": 4.0433, "loss/crossentropy": 2.404030203819275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200116366147995, "step": 19716 }, { "epoch": 0.39436, "grad_norm": 2.109375, "grad_norm_var": 0.032364908854166666, "learning_rate": 0.0001, "loss": 3.8609, "loss/crossentropy": 1.9242625832557678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19318503141403198, "step": 19718 }, { "epoch": 0.3944, "grad_norm": 1.890625, "grad_norm_var": 0.03240966796875, "learning_rate": 0.0001, "loss": 3.9789, "loss/crossentropy": 2.319575071334839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21668671071529388, "step": 19720 }, { "epoch": 0.39444, "grad_norm": 1.9921875, "grad_norm_var": 0.012287394205729166, "learning_rate": 0.0001, "loss": 4.2219, "loss/crossentropy": 2.0032835006713867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19786667823791504, "step": 19722 }, { "epoch": 0.39448, "grad_norm": 1.7734375, "grad_norm_var": 0.0146636962890625, "learning_rate": 0.0001, "loss": 3.9509, "loss/crossentropy": 1.9524562358856201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18965402245521545, "step": 19724 }, { "epoch": 0.39452, "grad_norm": 1.890625, "grad_norm_var": 0.007521311442057292, "learning_rate": 0.0001, "loss": 3.5677, "loss/crossentropy": 1.5887231826782227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1512351706624031, "step": 19726 }, { "epoch": 0.39456, "grad_norm": 2.046875, "grad_norm_var": 0.008576456705729167, "learning_rate": 0.0001, "loss": 4.1379, "loss/crossentropy": 2.107685923576355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18645642697811127, "step": 19728 }, { "epoch": 0.3946, "grad_norm": 1.8828125, "grad_norm_var": 0.009028879801432292, "learning_rate": 0.0001, "loss": 3.9594, "loss/crossentropy": 2.3551766872406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207053080201149, "step": 19730 }, { "epoch": 0.39464, "grad_norm": 2.0625, "grad_norm_var": 0.010847727457682291, "learning_rate": 0.0001, "loss": 4.3542, "loss/crossentropy": 2.1983554363250732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23001667857170105, "step": 19732 }, { "epoch": 0.39468, "grad_norm": 2.03125, "grad_norm_var": 0.009639485677083334, "learning_rate": 0.0001, "loss": 4.3158, "loss/crossentropy": 2.0236815214157104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19230765849351883, "step": 19734 }, { "epoch": 0.39472, "grad_norm": 1.9921875, "grad_norm_var": 0.010529581705729167, "learning_rate": 0.0001, "loss": 4.0511, "loss/crossentropy": 2.2027645111083984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1864710971713066, "step": 19736 }, { "epoch": 0.39476, "grad_norm": 2.15625, "grad_norm_var": 0.013152821858723959, "learning_rate": 0.0001, "loss": 4.008, "loss/crossentropy": 1.921549379825592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21788031607866287, "step": 19738 }, { "epoch": 0.3948, "grad_norm": 1.953125, "grad_norm_var": 0.010416666666666666, "learning_rate": 0.0001, "loss": 3.8353, "loss/crossentropy": 2.121790587902069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21380367130041122, "step": 19740 }, { "epoch": 0.39484, "grad_norm": 1.9140625, "grad_norm_var": 0.008217112223307291, "learning_rate": 0.0001, "loss": 3.9975, "loss/crossentropy": 1.9595564007759094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17919503152370453, "step": 19742 }, { "epoch": 0.39488, "grad_norm": 2.140625, "grad_norm_var": 0.010993448893229167, "learning_rate": 0.0001, "loss": 3.9935, "loss/crossentropy": 2.155984342098236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002984657883644, "step": 19744 }, { "epoch": 0.39492, "grad_norm": 1.7890625, "grad_norm_var": 0.012589518229166667, "learning_rate": 0.0001, "loss": 3.875, "loss/crossentropy": 1.766005277633667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1717890352010727, "step": 19746 }, { "epoch": 0.39496, "grad_norm": 2.015625, "grad_norm_var": 0.010424550374348958, "learning_rate": 0.0001, "loss": 3.9178, "loss/crossentropy": 1.8891428112983704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1965286061167717, "step": 19748 }, { "epoch": 0.395, "grad_norm": 1.9921875, "grad_norm_var": 0.010343170166015625, "learning_rate": 0.0001, "loss": 4.3124, "loss/crossentropy": 1.9521282315254211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21058151125907898, "step": 19750 }, { "epoch": 0.39504, "grad_norm": 1.8515625, "grad_norm_var": 0.014711252848307292, "learning_rate": 0.0001, "loss": 4.2242, "loss/crossentropy": 2.3142699003219604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24145027250051498, "step": 19752 }, { "epoch": 0.39508, "grad_norm": 1.9375, "grad_norm_var": 0.011777496337890625, "learning_rate": 0.0001, "loss": 4.1059, "loss/crossentropy": 2.0375224351882935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19370558112859726, "step": 19754 }, { "epoch": 0.39512, "grad_norm": 1.9140625, "grad_norm_var": 0.011962890625, "learning_rate": 0.0001, "loss": 4.2367, "loss/crossentropy": 2.193490743637085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2012534961104393, "step": 19756 }, { "epoch": 0.39516, "grad_norm": 2.015625, "grad_norm_var": 0.011993153889973959, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 2.2397992610931396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19738413393497467, "step": 19758 }, { "epoch": 0.3952, "grad_norm": 1.90625, "grad_norm_var": 0.010223134358723959, "learning_rate": 0.0001, "loss": 3.9153, "loss/crossentropy": 1.7602161169052124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18599014729261398, "step": 19760 }, { "epoch": 0.39524, "grad_norm": 1.9296875, "grad_norm_var": 0.008640289306640625, "learning_rate": 0.0001, "loss": 4.182, "loss/crossentropy": 2.536779046058655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2301289290189743, "step": 19762 }, { "epoch": 0.39528, "grad_norm": 1.8984375, "grad_norm_var": 0.008965810139973959, "learning_rate": 0.0001, "loss": 3.9999, "loss/crossentropy": 2.3237764835357666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21476060152053833, "step": 19764 }, { "epoch": 0.39532, "grad_norm": 2.0625, "grad_norm_var": 0.011107381184895833, "learning_rate": 0.0001, "loss": 4.2102, "loss/crossentropy": 2.370723605155945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24277979880571365, "step": 19766 }, { "epoch": 0.39536, "grad_norm": 2.09375, "grad_norm_var": 0.006917063395182292, "learning_rate": 0.0001, "loss": 4.0838, "loss/crossentropy": 1.6399320363998413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1740519106388092, "step": 19768 }, { "epoch": 0.3954, "grad_norm": 1.8828125, "grad_norm_var": 0.02394383748372396, "learning_rate": 0.0001, "loss": 3.6633, "loss/crossentropy": 1.8633801341056824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1810051053762436, "step": 19770 }, { "epoch": 0.39544, "grad_norm": 1.953125, "grad_norm_var": 0.023583984375, "learning_rate": 0.0001, "loss": 4.1102, "loss/crossentropy": 2.318315625190735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21214767545461655, "step": 19772 }, { "epoch": 0.39548, "grad_norm": 1.9921875, "grad_norm_var": 0.026387532552083332, "learning_rate": 0.0001, "loss": 4.1232, "loss/crossentropy": 2.2803520560264587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21136894822120667, "step": 19774 }, { "epoch": 0.39552, "grad_norm": 1.8359375, "grad_norm_var": 0.026387532552083332, "learning_rate": 0.0001, "loss": 3.9034, "loss/crossentropy": 2.1529780626296997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18172463029623032, "step": 19776 }, { "epoch": 0.39556, "grad_norm": 2.015625, "grad_norm_var": 0.025585683186848958, "learning_rate": 0.0001, "loss": 4.2851, "loss/crossentropy": 2.1246083974838257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2181987464427948, "step": 19778 }, { "epoch": 0.3956, "grad_norm": 1.90625, "grad_norm_var": 0.029515584309895832, "learning_rate": 0.0001, "loss": 4.0097, "loss/crossentropy": 2.027298629283905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877325028181076, "step": 19780 }, { "epoch": 0.39564, "grad_norm": 2.0, "grad_norm_var": 0.0288482666015625, "learning_rate": 0.0001, "loss": 4.2326, "loss/crossentropy": 1.9338072538375854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1746418997645378, "step": 19782 }, { "epoch": 0.39568, "grad_norm": 2.1875, "grad_norm_var": 0.03104222615559896, "learning_rate": 0.0001, "loss": 4.2551, "loss/crossentropy": 2.050130307674408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2011866271495819, "step": 19784 }, { "epoch": 0.39572, "grad_norm": 1.9765625, "grad_norm_var": 0.014720662434895834, "learning_rate": 0.0001, "loss": 4.3227, "loss/crossentropy": 2.48315691947937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22591856867074966, "step": 19786 }, { "epoch": 0.39576, "grad_norm": 2.046875, "grad_norm_var": 0.021019490559895833, "learning_rate": 0.0001, "loss": 4.3671, "loss/crossentropy": 2.578279137611389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216324657201767, "step": 19788 }, { "epoch": 0.3958, "grad_norm": 1.8828125, "grad_norm_var": 0.018192291259765625, "learning_rate": 0.0001, "loss": 4.259, "loss/crossentropy": 1.7862395644187927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17009516060352325, "step": 19790 }, { "epoch": 0.39584, "grad_norm": 1.8828125, "grad_norm_var": 0.017032877604166666, "learning_rate": 0.0001, "loss": 3.8506, "loss/crossentropy": 2.042613208293915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1844482272863388, "step": 19792 }, { "epoch": 0.39588, "grad_norm": 1.7890625, "grad_norm_var": 0.020926666259765626, "learning_rate": 0.0001, "loss": 4.1919, "loss/crossentropy": 2.317251443862915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2135745733976364, "step": 19794 }, { "epoch": 0.39592, "grad_norm": 2.109375, "grad_norm_var": 0.01964111328125, "learning_rate": 0.0001, "loss": 4.0061, "loss/crossentropy": 1.849199891090393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20549577474594116, "step": 19796 }, { "epoch": 0.39596, "grad_norm": 2.015625, "grad_norm_var": 0.0199859619140625, "learning_rate": 0.0001, "loss": 4.0551, "loss/crossentropy": 2.1876507997512817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20098386704921722, "step": 19798 }, { "epoch": 0.396, "grad_norm": 1.9921875, "grad_norm_var": 0.01718317667643229, "learning_rate": 0.0001, "loss": 4.154, "loss/crossentropy": 1.9740530848503113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19494586437940598, "step": 19800 }, { "epoch": 0.39604, "grad_norm": 1.9375, "grad_norm_var": 0.017577107747395834, "learning_rate": 0.0001, "loss": 4.1405, "loss/crossentropy": 2.208943724632263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22833283245563507, "step": 19802 }, { "epoch": 0.39608, "grad_norm": 1.96875, "grad_norm_var": 0.009488677978515625, "learning_rate": 0.0001, "loss": 3.9082, "loss/crossentropy": 2.024399518966675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19786083698272705, "step": 19804 }, { "epoch": 0.39612, "grad_norm": 2.03125, "grad_norm_var": 0.009356435139973958, "learning_rate": 0.0001, "loss": 4.1475, "loss/crossentropy": 2.1283441185951233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21942409127950668, "step": 19806 }, { "epoch": 0.39616, "grad_norm": 1.953125, "grad_norm_var": 0.01080322265625, "learning_rate": 0.0001, "loss": 4.0101, "loss/crossentropy": 2.3675626516342163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22466859221458435, "step": 19808 }, { "epoch": 0.3962, "grad_norm": 2.078125, "grad_norm_var": 0.0082672119140625, "learning_rate": 0.0001, "loss": 3.8682, "loss/crossentropy": 1.886322796344757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1904991939663887, "step": 19810 }, { "epoch": 0.39624, "grad_norm": 2.03125, "grad_norm_var": 0.005012003580729166, "learning_rate": 0.0001, "loss": 3.9071, "loss/crossentropy": 2.098154127597809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2065964937210083, "step": 19812 }, { "epoch": 0.39628, "grad_norm": 1.8515625, "grad_norm_var": 0.006788889567057292, "learning_rate": 0.0001, "loss": 3.8447, "loss/crossentropy": 2.0527199506759644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19717100262641907, "step": 19814 }, { "epoch": 0.39632, "grad_norm": 2.03125, "grad_norm_var": 0.011244455973307291, "learning_rate": 0.0001, "loss": 4.2692, "loss/crossentropy": 2.0650912523269653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2021411582827568, "step": 19816 }, { "epoch": 0.39636, "grad_norm": 2.34375, "grad_norm_var": 0.018700154622395833, "learning_rate": 0.0001, "loss": 4.3611, "loss/crossentropy": 2.2239702939987183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19744005054235458, "step": 19818 }, { "epoch": 0.3964, "grad_norm": 1.9375, "grad_norm_var": 0.020369211832682293, "learning_rate": 0.0001, "loss": 3.9029, "loss/crossentropy": 2.123336434364319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18976306170225143, "step": 19820 }, { "epoch": 0.39644, "grad_norm": 1.9296875, "grad_norm_var": 0.02154515584309896, "learning_rate": 0.0001, "loss": 3.9966, "loss/crossentropy": 2.121657133102417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159118503332138, "step": 19822 }, { "epoch": 0.39648, "grad_norm": 2.078125, "grad_norm_var": 0.0185455322265625, "learning_rate": 0.0001, "loss": 3.7621, "loss/crossentropy": 1.7323416471481323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18072299659252167, "step": 19824 }, { "epoch": 0.39652, "grad_norm": 2.21875, "grad_norm_var": 0.020243072509765626, "learning_rate": 0.0001, "loss": 4.0183, "loss/crossentropy": 2.181841015815735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21452204138040543, "step": 19826 }, { "epoch": 0.39656, "grad_norm": 1.9296875, "grad_norm_var": 0.02104670206705729, "learning_rate": 0.0001, "loss": 4.1181, "loss/crossentropy": 1.9022215008735657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18924792110919952, "step": 19828 }, { "epoch": 0.3966, "grad_norm": 1.984375, "grad_norm_var": 0.018317667643229167, "learning_rate": 0.0001, "loss": 4.2505, "loss/crossentropy": 2.1240022778511047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18618234246969223, "step": 19830 }, { "epoch": 0.39664, "grad_norm": 1.9921875, "grad_norm_var": 0.015710194905598957, "learning_rate": 0.0001, "loss": 3.8611, "loss/crossentropy": 1.5959683060646057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17317308485507965, "step": 19832 }, { "epoch": 0.39668, "grad_norm": 1.9765625, "grad_norm_var": 0.007682291666666666, "learning_rate": 0.0001, "loss": 4.1281, "loss/crossentropy": 2.1812866926193237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192174032330513, "step": 19834 }, { "epoch": 0.39672, "grad_norm": 1.953125, "grad_norm_var": 0.010651652018229167, "learning_rate": 0.0001, "loss": 4.188, "loss/crossentropy": 1.8964568972587585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20922592282295227, "step": 19836 }, { "epoch": 0.39676, "grad_norm": 1.8359375, "grad_norm_var": 0.012013498942057292, "learning_rate": 0.0001, "loss": 4.1369, "loss/crossentropy": 2.0822665691375732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18019618093967438, "step": 19838 }, { "epoch": 0.3968, "grad_norm": 1.9453125, "grad_norm_var": 0.010985310872395833, "learning_rate": 0.0001, "loss": 4.0271, "loss/crossentropy": 2.116270899772644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21604043990373611, "step": 19840 }, { "epoch": 0.39684, "grad_norm": 1.9609375, "grad_norm_var": 0.007759602864583334, "learning_rate": 0.0001, "loss": 3.9132, "loss/crossentropy": 1.7761988639831543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20087965577840805, "step": 19842 }, { "epoch": 0.39688, "grad_norm": 1.8984375, "grad_norm_var": 0.008649698893229167, "learning_rate": 0.0001, "loss": 4.0187, "loss/crossentropy": 1.6091360449790955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18282026052474976, "step": 19844 }, { "epoch": 0.39692, "grad_norm": 1.8828125, "grad_norm_var": 0.009547678629557292, "learning_rate": 0.0001, "loss": 4.4441, "loss/crossentropy": 2.6394678354263306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20579150319099426, "step": 19846 }, { "epoch": 0.39696, "grad_norm": 2.0, "grad_norm_var": 0.010011545817057292, "learning_rate": 0.0001, "loss": 4.1778, "loss/crossentropy": 2.0911704897880554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20075388252735138, "step": 19848 }, { "epoch": 0.397, "grad_norm": 1.9765625, "grad_norm_var": 0.010139719645182291, "learning_rate": 0.0001, "loss": 4.3458, "loss/crossentropy": 2.1299456357955933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067631408572197, "step": 19850 }, { "epoch": 0.39704, "grad_norm": 1.875, "grad_norm_var": 0.004937489827473958, "learning_rate": 0.0001, "loss": 3.9037, "loss/crossentropy": 1.9630563855171204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187811940908432, "step": 19852 }, { "epoch": 0.39708, "grad_norm": 1.90625, "grad_norm_var": 0.004808553059895833, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 2.1914453506469727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064526304602623, "step": 19854 }, { "epoch": 0.39712, "grad_norm": 2.046875, "grad_norm_var": 0.012412261962890626, "learning_rate": 0.0001, "loss": 4.1404, "loss/crossentropy": 2.3003474473953247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20841488242149353, "step": 19856 }, { "epoch": 0.39716, "grad_norm": 1.84375, "grad_norm_var": 0.01682306925455729, "learning_rate": 0.0001, "loss": 3.96, "loss/crossentropy": 2.1625255346298218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20437289774417877, "step": 19858 }, { "epoch": 0.3972, "grad_norm": 1.9765625, "grad_norm_var": 0.015868123372395834, "learning_rate": 0.0001, "loss": 3.8594, "loss/crossentropy": 1.6742416620254517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16787201166152954, "step": 19860 }, { "epoch": 0.39724, "grad_norm": 1.921875, "grad_norm_var": 0.01546630859375, "learning_rate": 0.0001, "loss": 3.8889, "loss/crossentropy": 2.0682146549224854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1715894490480423, "step": 19862 }, { "epoch": 0.39728, "grad_norm": 1.96875, "grad_norm_var": 0.017071278889973958, "learning_rate": 0.0001, "loss": 4.0319, "loss/crossentropy": 1.99592924118042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19570383429527283, "step": 19864 }, { "epoch": 0.39732, "grad_norm": 1.9921875, "grad_norm_var": 0.01967137654622396, "learning_rate": 0.0001, "loss": 4.0123, "loss/crossentropy": 2.086844265460968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123936414718628, "step": 19866 }, { "epoch": 0.39736, "grad_norm": 1.9921875, "grad_norm_var": 0.017256673177083334, "learning_rate": 0.0001, "loss": 4.2245, "loss/crossentropy": 1.7985658645629883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18117651343345642, "step": 19868 }, { "epoch": 0.3974, "grad_norm": 1.9140625, "grad_norm_var": 0.018949381510416665, "learning_rate": 0.0001, "loss": 3.8449, "loss/crossentropy": 1.9804646372795105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18682067096233368, "step": 19870 }, { "epoch": 0.39744, "grad_norm": 1.796875, "grad_norm_var": 0.01546630859375, "learning_rate": 0.0001, "loss": 4.0692, "loss/crossentropy": 1.9027678966522217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18195898830890656, "step": 19872 }, { "epoch": 0.39748, "grad_norm": 1.9296875, "grad_norm_var": 0.01660944620768229, "learning_rate": 0.0001, "loss": 4.1191, "loss/crossentropy": 2.3370686769485474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2376203015446663, "step": 19874 }, { "epoch": 0.39752, "grad_norm": 1.890625, "grad_norm_var": 0.018363189697265626, "learning_rate": 0.0001, "loss": 3.8472, "loss/crossentropy": 1.7463279366493225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16485755145549774, "step": 19876 }, { "epoch": 0.39756, "grad_norm": 2.015625, "grad_norm_var": 0.017967732747395833, "learning_rate": 0.0001, "loss": 4.4039, "loss/crossentropy": 2.1171544194221497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20631500333547592, "step": 19878 }, { "epoch": 0.3976, "grad_norm": 1.84375, "grad_norm_var": 0.016733551025390626, "learning_rate": 0.0001, "loss": 3.7242, "loss/crossentropy": 2.1109176874160767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18512246757745743, "step": 19880 }, { "epoch": 0.39764, "grad_norm": 1.9375, "grad_norm_var": 0.013618977864583333, "learning_rate": 0.0001, "loss": 3.6843, "loss/crossentropy": 1.927691638469696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19118069112300873, "step": 19882 }, { "epoch": 0.39768, "grad_norm": 1.9921875, "grad_norm_var": 0.01337890625, "learning_rate": 0.0001, "loss": 3.9539, "loss/crossentropy": 2.041896104812622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21281379461288452, "step": 19884 }, { "epoch": 0.39772, "grad_norm": 1.7890625, "grad_norm_var": 0.013272857666015625, "learning_rate": 0.0001, "loss": 3.9146, "loss/crossentropy": 1.8779407739639282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.180915005505085, "step": 19886 }, { "epoch": 0.39776, "grad_norm": 1.9296875, "grad_norm_var": 0.019001261393229166, "learning_rate": 0.0001, "loss": 4.2158, "loss/crossentropy": 1.9899646639823914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19102467596530914, "step": 19888 }, { "epoch": 0.3978, "grad_norm": 2.0, "grad_norm_var": 0.012562815348307292, "learning_rate": 0.0001, "loss": 4.1822, "loss/crossentropy": 2.3294299840927124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20739557594060898, "step": 19890 }, { "epoch": 0.39784, "grad_norm": 1.9921875, "grad_norm_var": 0.011864980061848959, "learning_rate": 0.0001, "loss": 4.2829, "loss/crossentropy": 2.2126539945602417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23066286742687225, "step": 19892 }, { "epoch": 0.39788, "grad_norm": 2.1875, "grad_norm_var": 0.016454060872395832, "learning_rate": 0.0001, "loss": 3.8665, "loss/crossentropy": 1.9949330687522888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19172564148902893, "step": 19894 }, { "epoch": 0.39792, "grad_norm": 2.0625, "grad_norm_var": 0.02158177693684896, "learning_rate": 0.0001, "loss": 3.9567, "loss/crossentropy": 2.0896310210227966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19692128896713257, "step": 19896 }, { "epoch": 0.39796, "grad_norm": 1.921875, "grad_norm_var": 0.020310211181640624, "learning_rate": 0.0001, "loss": 4.1856, "loss/crossentropy": 2.191789746284485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073511779308319, "step": 19898 }, { "epoch": 0.398, "grad_norm": 1.921875, "grad_norm_var": 0.022415924072265624, "learning_rate": 0.0001, "loss": 3.9965, "loss/crossentropy": 1.9925153255462646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20123805850744247, "step": 19900 }, { "epoch": 0.39804, "grad_norm": 1.8125, "grad_norm_var": 0.021476236979166667, "learning_rate": 0.0001, "loss": 3.7917, "loss/crossentropy": 1.8903232216835022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18577788770198822, "step": 19902 }, { "epoch": 0.39808, "grad_norm": 2.078125, "grad_norm_var": 0.01817804972330729, "learning_rate": 0.0001, "loss": 4.2818, "loss/crossentropy": 2.055288314819336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19745147973299026, "step": 19904 }, { "epoch": 0.39812, "grad_norm": 2.0625, "grad_norm_var": 0.018570709228515624, "learning_rate": 0.0001, "loss": 4.196, "loss/crossentropy": 1.7709746360778809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17147859930992126, "step": 19906 }, { "epoch": 0.39816, "grad_norm": 2.0, "grad_norm_var": 0.01843846638997396, "learning_rate": 0.0001, "loss": 4.182, "loss/crossentropy": 2.3295364379882812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20680927485227585, "step": 19908 }, { "epoch": 0.3982, "grad_norm": 1.9765625, "grad_norm_var": 0.014414215087890625, "learning_rate": 0.0001, "loss": 4.1633, "loss/crossentropy": 1.9828099608421326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202943354845047, "step": 19910 }, { "epoch": 0.39824, "grad_norm": 2.046875, "grad_norm_var": 0.009352366129557291, "learning_rate": 0.0001, "loss": 4.2294, "loss/crossentropy": 2.140046715736389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1967788189649582, "step": 19912 }, { "epoch": 0.39828, "grad_norm": 1.9453125, "grad_norm_var": 0.008736165364583333, "learning_rate": 0.0001, "loss": 4.1673, "loss/crossentropy": 2.3365899324417114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2261614426970482, "step": 19914 }, { "epoch": 0.39832, "grad_norm": 1.9921875, "grad_norm_var": 0.0075642903645833336, "learning_rate": 0.0001, "loss": 3.8716, "loss/crossentropy": 2.101326584815979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19668899476528168, "step": 19916 }, { "epoch": 0.39836, "grad_norm": 1.96875, "grad_norm_var": 0.005037180582682292, "learning_rate": 0.0001, "loss": 4.1572, "loss/crossentropy": 1.9804238080978394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117360234260559, "step": 19918 }, { "epoch": 0.3984, "grad_norm": 1.9140625, "grad_norm_var": 0.0037676493326822915, "learning_rate": 0.0001, "loss": 3.945, "loss/crossentropy": 1.858399510383606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20013604313135147, "step": 19920 }, { "epoch": 0.39844, "grad_norm": 1.90625, "grad_norm_var": 0.005832672119140625, "learning_rate": 0.0001, "loss": 4.0302, "loss/crossentropy": 2.087529957294464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19001878052949905, "step": 19922 }, { "epoch": 0.39848, "grad_norm": 1.8359375, "grad_norm_var": 0.0068267822265625, "learning_rate": 0.0001, "loss": 3.9425, "loss/crossentropy": 2.2521530389785767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20182852447032928, "step": 19924 }, { "epoch": 0.39852, "grad_norm": 2.109375, "grad_norm_var": 0.00784912109375, "learning_rate": 0.0001, "loss": 4.3362, "loss/crossentropy": 2.370557188987732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27234383672475815, "step": 19926 }, { "epoch": 0.39856, "grad_norm": 13.0625, "grad_norm_var": 7.6871192932128904, "learning_rate": 0.0001, "loss": 4.0383, "loss/crossentropy": 2.18235445022583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20694056898355484, "step": 19928 }, { "epoch": 0.3986, "grad_norm": 2.125, "grad_norm_var": 7.659780883789063, "learning_rate": 0.0001, "loss": 3.8946, "loss/crossentropy": 1.9220272898674011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21540776640176773, "step": 19930 }, { "epoch": 0.39864, "grad_norm": 2.015625, "grad_norm_var": 7.6551513671875, "learning_rate": 0.0001, "loss": 4.015, "loss/crossentropy": 2.064575970172882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20172829926013947, "step": 19932 }, { "epoch": 0.39868, "grad_norm": 2.296875, "grad_norm_var": 7.620402018229167, "learning_rate": 0.0001, "loss": 4.5782, "loss/crossentropy": 2.1824090480804443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23097260296344757, "step": 19934 }, { "epoch": 0.39872, "grad_norm": 2.0, "grad_norm_var": 7.594489542643229, "learning_rate": 0.0001, "loss": 3.9227, "loss/crossentropy": 2.211961567401886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150636538863182, "step": 19936 }, { "epoch": 0.39876, "grad_norm": 2.4375, "grad_norm_var": 7.586128743489583, "learning_rate": 0.0001, "loss": 4.1724, "loss/crossentropy": 1.8789254426956177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18855369836091995, "step": 19938 }, { "epoch": 0.3988, "grad_norm": 2.03125, "grad_norm_var": 7.576968383789063, "learning_rate": 0.0001, "loss": 4.212, "loss/crossentropy": 2.322842240333557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19432100653648376, "step": 19940 }, { "epoch": 0.39884, "grad_norm": 1.9375, "grad_norm_var": 7.590311686197917, "learning_rate": 0.0001, "loss": 4.2201, "loss/crossentropy": 2.1524535417556763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19931253790855408, "step": 19942 }, { "epoch": 0.39888, "grad_norm": 1.8359375, "grad_norm_var": 0.0280670166015625, "learning_rate": 0.0001, "loss": 4.0166, "loss/crossentropy": 2.0835599303245544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18674355000257492, "step": 19944 }, { "epoch": 0.39892, "grad_norm": 1.96875, "grad_norm_var": 0.026718902587890624, "learning_rate": 0.0001, "loss": 3.9398, "loss/crossentropy": 1.8204763531684875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970055252313614, "step": 19946 }, { "epoch": 0.39896, "grad_norm": 1.9375, "grad_norm_var": 0.02769953409830729, "learning_rate": 0.0001, "loss": 3.8403, "loss/crossentropy": 2.1432100534439087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960032731294632, "step": 19948 }, { "epoch": 0.399, "grad_norm": 1.9296875, "grad_norm_var": 0.0227294921875, "learning_rate": 0.0001, "loss": 4.1183, "loss/crossentropy": 1.9959533214569092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18608924746513367, "step": 19950 }, { "epoch": 0.39904, "grad_norm": 2.015625, "grad_norm_var": 0.02399266560872396, "learning_rate": 0.0001, "loss": 4.0235, "loss/crossentropy": 1.9874342679977417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18600600212812424, "step": 19952 }, { "epoch": 0.39908, "grad_norm": 1.9296875, "grad_norm_var": 0.00849609375, "learning_rate": 0.0001, "loss": 4.2266, "loss/crossentropy": 2.5082361698150635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22386416047811508, "step": 19954 }, { "epoch": 0.39912, "grad_norm": 1.90625, "grad_norm_var": 0.008153279622395834, "learning_rate": 0.0001, "loss": 3.9735, "loss/crossentropy": 2.1550523042678833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169199213385582, "step": 19956 }, { "epoch": 0.39916, "grad_norm": 1.9609375, "grad_norm_var": 0.008172353108723959, "learning_rate": 0.0001, "loss": 3.8834, "loss/crossentropy": 2.079386830329895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19080224633216858, "step": 19958 }, { "epoch": 0.3992, "grad_norm": 1.8828125, "grad_norm_var": 0.006030019124348958, "learning_rate": 0.0001, "loss": 3.973, "loss/crossentropy": 1.8762348890304565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1813136711716652, "step": 19960 }, { "epoch": 0.39924, "grad_norm": 1.84375, "grad_norm_var": 0.006455230712890625, "learning_rate": 0.0001, "loss": 4.0965, "loss/crossentropy": 1.789110004901886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17166541516780853, "step": 19962 }, { "epoch": 0.39928, "grad_norm": 1.859375, "grad_norm_var": 0.009056599934895833, "learning_rate": 0.0001, "loss": 4.0912, "loss/crossentropy": 2.0196239948272705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20005135238170624, "step": 19964 }, { "epoch": 0.39932, "grad_norm": 1.8984375, "grad_norm_var": 0.0080718994140625, "learning_rate": 0.0001, "loss": 4.2787, "loss/crossentropy": 2.192206025123596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979481726884842, "step": 19966 }, { "epoch": 0.39936, "grad_norm": 2.0625, "grad_norm_var": 0.007157135009765625, "learning_rate": 0.0001, "loss": 4.3155, "loss/crossentropy": 2.263342499732971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187722995877266, "step": 19968 }, { "epoch": 0.3994, "grad_norm": 2.109375, "grad_norm_var": 0.008161417643229167, "learning_rate": 0.0001, "loss": 4.2727, "loss/crossentropy": 2.3387625217437744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21375280618667603, "step": 19970 }, { "epoch": 0.39944, "grad_norm": 1.96875, "grad_norm_var": 0.008733876546223958, "learning_rate": 0.0001, "loss": 4.0442, "loss/crossentropy": 1.5837730765342712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17303457856178284, "step": 19972 }, { "epoch": 0.39948, "grad_norm": 1.984375, "grad_norm_var": 0.008678944905598958, "learning_rate": 0.0001, "loss": 3.8398, "loss/crossentropy": 1.9090858697891235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838761642575264, "step": 19974 }, { "epoch": 0.39952, "grad_norm": 1.875, "grad_norm_var": 0.008780924479166667, "learning_rate": 0.0001, "loss": 3.9345, "loss/crossentropy": 2.107620596885681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19132380187511444, "step": 19976 }, { "epoch": 0.39956, "grad_norm": 1.8671875, "grad_norm_var": 0.009200032552083333, "learning_rate": 0.0001, "loss": 4.0709, "loss/crossentropy": 2.1706109046936035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19275012612342834, "step": 19978 }, { "epoch": 0.3996, "grad_norm": 1.8828125, "grad_norm_var": 0.007356516520182292, "learning_rate": 0.0001, "loss": 3.8486, "loss/crossentropy": 2.0092907547950745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18047627061605453, "step": 19980 }, { "epoch": 0.39964, "grad_norm": 2.046875, "grad_norm_var": 0.025349934895833332, "learning_rate": 0.0001, "loss": 4.2226, "loss/crossentropy": 2.1216511726379395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20591023564338684, "step": 19982 }, { "epoch": 0.39968, "grad_norm": 2.046875, "grad_norm_var": 0.02671076456705729, "learning_rate": 0.0001, "loss": 3.9895, "loss/crossentropy": 2.3002817630767822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20734255760908127, "step": 19984 }, { "epoch": 0.39972, "grad_norm": 2.09375, "grad_norm_var": 0.02664972941080729, "learning_rate": 0.0001, "loss": 4.1494, "loss/crossentropy": 2.3097801208496094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20194757729768753, "step": 19986 }, { "epoch": 0.39976, "grad_norm": 2.15625, "grad_norm_var": 0.029130045572916666, "learning_rate": 0.0001, "loss": 3.9706, "loss/crossentropy": 2.2645580768585205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21866093575954437, "step": 19988 }, { "epoch": 0.3998, "grad_norm": 1.875, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 4.0634, "loss/crossentropy": 2.224185347557068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19011163711547852, "step": 19990 }, { "epoch": 0.39984, "grad_norm": 1.859375, "grad_norm_var": 0.030248006184895832, "learning_rate": 0.0001, "loss": 3.9552, "loss/crossentropy": 1.958588182926178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18359722197055817, "step": 19992 }, { "epoch": 0.39988, "grad_norm": 1.9375, "grad_norm_var": 0.029361724853515625, "learning_rate": 0.0001, "loss": 4.0449, "loss/crossentropy": 2.5081194639205933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22576630860567093, "step": 19994 }, { "epoch": 0.39992, "grad_norm": 1.8359375, "grad_norm_var": 0.0306549072265625, "learning_rate": 0.0001, "loss": 3.8833, "loss/crossentropy": 1.9870144724845886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19262682646512985, "step": 19996 }, { "epoch": 0.39996, "grad_norm": 1.828125, "grad_norm_var": 0.0118560791015625, "learning_rate": 0.0001, "loss": 4.1321, "loss/crossentropy": 2.227096438407898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2056111991405487, "step": 19998 }, { "epoch": 0.4, "grad_norm": 1.796875, "grad_norm_var": 0.012837473551432292, "learning_rate": 0.0001, "loss": 4.1659, "loss/crossentropy": 1.8458907008171082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18857256323099136, "step": 20000 }, { "epoch": 0.40004, "grad_norm": 1.90625, "grad_norm_var": 0.010536448160807291, "learning_rate": 0.0001, "loss": 4.1497, "loss/crossentropy": 2.184568405151367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20918434113264084, "step": 20002 }, { "epoch": 0.40008, "grad_norm": 1.8203125, "grad_norm_var": 0.007136027018229167, "learning_rate": 0.0001, "loss": 3.8604, "loss/crossentropy": 2.046003818511963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17669742554426193, "step": 20004 }, { "epoch": 0.40012, "grad_norm": 1.8671875, "grad_norm_var": 0.006151326497395833, "learning_rate": 0.0001, "loss": 4.1618, "loss/crossentropy": 2.330680012702942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159157246351242, "step": 20006 }, { "epoch": 0.40016, "grad_norm": 1.796875, "grad_norm_var": 0.005973052978515625, "learning_rate": 0.0001, "loss": 3.643, "loss/crossentropy": 1.9141735434532166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17788738757371902, "step": 20008 }, { "epoch": 0.4002, "grad_norm": 1.9921875, "grad_norm_var": 0.006589508056640625, "learning_rate": 0.0001, "loss": 4.1782, "loss/crossentropy": 2.3000227212905884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21896196901798248, "step": 20010 }, { "epoch": 0.40024, "grad_norm": 1.9921875, "grad_norm_var": 0.0067535400390625, "learning_rate": 0.0001, "loss": 4.2057, "loss/crossentropy": 2.151344060897827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20495032519102097, "step": 20012 }, { "epoch": 0.40028, "grad_norm": 1.8828125, "grad_norm_var": 0.007911936442057291, "learning_rate": 0.0001, "loss": 3.9086, "loss/crossentropy": 2.1465260982513428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1846243441104889, "step": 20014 }, { "epoch": 0.40032, "grad_norm": 1.9375, "grad_norm_var": 0.006453450520833333, "learning_rate": 0.0001, "loss": 4.0024, "loss/crossentropy": 1.662541925907135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18223944306373596, "step": 20016 }, { "epoch": 0.40036, "grad_norm": 1.921875, "grad_norm_var": 0.006418609619140625, "learning_rate": 0.0001, "loss": 4.0052, "loss/crossentropy": 1.9482674598693848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18713432550430298, "step": 20018 }, { "epoch": 0.4004, "grad_norm": 1.9296875, "grad_norm_var": 0.0058837890625, "learning_rate": 0.0001, "loss": 3.7834, "loss/crossentropy": 1.7436261773109436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18203437328338623, "step": 20020 }, { "epoch": 0.40044, "grad_norm": 1.9609375, "grad_norm_var": 0.005716705322265625, "learning_rate": 0.0001, "loss": 3.8905, "loss/crossentropy": 2.0031047463417053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20406989008188248, "step": 20022 }, { "epoch": 0.40048, "grad_norm": 2.03125, "grad_norm_var": 0.005415852864583333, "learning_rate": 0.0001, "loss": 3.8527, "loss/crossentropy": 1.9428189992904663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17989446222782135, "step": 20024 }, { "epoch": 0.40052, "grad_norm": 1.8984375, "grad_norm_var": 0.005208333333333333, "learning_rate": 0.0001, "loss": 3.9674, "loss/crossentropy": 2.021029829978943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1908131018280983, "step": 20026 }, { "epoch": 0.40056, "grad_norm": 2.0, "grad_norm_var": 0.02068049112955729, "learning_rate": 0.0001, "loss": 4.3471, "loss/crossentropy": 1.8795804381370544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17839516699314117, "step": 20028 }, { "epoch": 0.4006, "grad_norm": 2.296875, "grad_norm_var": 0.022823079427083334, "learning_rate": 0.0001, "loss": 4.4826, "loss/crossentropy": 1.9421144723892212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18655938655138016, "step": 20030 }, { "epoch": 0.40064, "grad_norm": 2.0625, "grad_norm_var": 0.022809855143229165, "learning_rate": 0.0001, "loss": 3.9615, "loss/crossentropy": 2.1828919649124146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21237139403820038, "step": 20032 }, { "epoch": 0.40068, "grad_norm": 1.984375, "grad_norm_var": 0.02103271484375, "learning_rate": 0.0001, "loss": 4.0375, "loss/crossentropy": 1.9509565830230713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19763849675655365, "step": 20034 }, { "epoch": 0.40072, "grad_norm": 1.84375, "grad_norm_var": 0.022507476806640624, "learning_rate": 0.0001, "loss": 4.0011, "loss/crossentropy": 2.1327446699142456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048783376812935, "step": 20036 }, { "epoch": 0.40076, "grad_norm": 2.140625, "grad_norm_var": 0.022904205322265624, "learning_rate": 0.0001, "loss": 4.2094, "loss/crossentropy": 2.3295645713806152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22039268165826797, "step": 20038 }, { "epoch": 0.4008, "grad_norm": 2.203125, "grad_norm_var": 0.03260879516601563, "learning_rate": 0.0001, "loss": 3.7464, "loss/crossentropy": 1.786357820034027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1681244671344757, "step": 20040 }, { "epoch": 0.40084, "grad_norm": 2.140625, "grad_norm_var": 0.033113352457682294, "learning_rate": 0.0001, "loss": 4.244, "loss/crossentropy": 2.0342337489128113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19485876709222794, "step": 20042 }, { "epoch": 0.40088, "grad_norm": 1.96875, "grad_norm_var": 0.027795155843098957, "learning_rate": 0.0001, "loss": 3.8838, "loss/crossentropy": 2.23150634765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20408733934164047, "step": 20044 }, { "epoch": 0.40092, "grad_norm": 1.8671875, "grad_norm_var": 0.026805623372395834, "learning_rate": 0.0001, "loss": 3.9708, "loss/crossentropy": 2.086575150489807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20418858528137207, "step": 20046 }, { "epoch": 0.40096, "grad_norm": 1.9453125, "grad_norm_var": 0.026741536458333333, "learning_rate": 0.0001, "loss": 4.1508, "loss/crossentropy": 2.2025340795516968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21201904118061066, "step": 20048 }, { "epoch": 0.401, "grad_norm": 2.09375, "grad_norm_var": 0.02859471638997396, "learning_rate": 0.0001, "loss": 4.2056, "loss/crossentropy": 2.175204277038574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20766476541757584, "step": 20050 }, { "epoch": 0.40104, "grad_norm": 1.9375, "grad_norm_var": 0.027581532796223957, "learning_rate": 0.0001, "loss": 4.0223, "loss/crossentropy": 2.159320116043091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013096585869789, "step": 20052 }, { "epoch": 0.40108, "grad_norm": 2.09375, "grad_norm_var": 0.028452301025390626, "learning_rate": 0.0001, "loss": 3.8412, "loss/crossentropy": 1.5438128113746643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1641307920217514, "step": 20054 }, { "epoch": 0.40112, "grad_norm": 1.9609375, "grad_norm_var": 0.0180084228515625, "learning_rate": 0.0001, "loss": 3.8949, "loss/crossentropy": 2.0549490451812744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2176165133714676, "step": 20056 }, { "epoch": 0.40116, "grad_norm": 1.90625, "grad_norm_var": 0.01363525390625, "learning_rate": 0.0001, "loss": 4.1048, "loss/crossentropy": 1.8520516753196716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19935637712478638, "step": 20058 }, { "epoch": 0.4012, "grad_norm": 1.90625, "grad_norm_var": 0.007112375895182292, "learning_rate": 0.0001, "loss": 3.8383, "loss/crossentropy": 2.061866581439972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928197741508484, "step": 20060 }, { "epoch": 0.40124, "grad_norm": 1.859375, "grad_norm_var": 0.006780751546223958, "learning_rate": 0.0001, "loss": 3.9205, "loss/crossentropy": 2.0396493673324585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19679231941699982, "step": 20062 }, { "epoch": 0.40128, "grad_norm": 2.109375, "grad_norm_var": 0.008231353759765626, "learning_rate": 0.0001, "loss": 4.2929, "loss/crossentropy": 1.8019860982894897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17440176755189896, "step": 20064 }, { "epoch": 0.40132, "grad_norm": 1.8515625, "grad_norm_var": 0.011027018229166666, "learning_rate": 0.0001, "loss": 4.1233, "loss/crossentropy": 2.087724268436432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18905764818191528, "step": 20066 }, { "epoch": 0.40136, "grad_norm": 1.921875, "grad_norm_var": 0.0110504150390625, "learning_rate": 0.0001, "loss": 3.9207, "loss/crossentropy": 1.899372398853302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20874439179897308, "step": 20068 }, { "epoch": 0.4014, "grad_norm": 1.9140625, "grad_norm_var": 0.009073893229166666, "learning_rate": 0.0001, "loss": 3.9521, "loss/crossentropy": 1.740720272064209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21033668518066406, "step": 20070 }, { "epoch": 0.40144, "grad_norm": 1.9765625, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 3.9399, "loss/crossentropy": 2.043474793434143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20285866409540176, "step": 20072 }, { "epoch": 0.40148, "grad_norm": 1.8359375, "grad_norm_var": 0.010114542643229167, "learning_rate": 0.0001, "loss": 3.8251, "loss/crossentropy": 1.907653033733368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19814437627792358, "step": 20074 }, { "epoch": 0.40152, "grad_norm": 1.953125, "grad_norm_var": 0.0099029541015625, "learning_rate": 0.0001, "loss": 4.004, "loss/crossentropy": 1.803492784500122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19028827548027039, "step": 20076 }, { "epoch": 0.40156, "grad_norm": 1.8671875, "grad_norm_var": 0.017463175455729167, "learning_rate": 0.0001, "loss": 4.0304, "loss/crossentropy": 2.2277488708496094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069825902581215, "step": 20078 }, { "epoch": 0.4016, "grad_norm": 1.9375, "grad_norm_var": 0.01710205078125, "learning_rate": 0.0001, "loss": 4.0578, "loss/crossentropy": 1.9414427280426025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.182416632771492, "step": 20080 }, { "epoch": 0.40164, "grad_norm": 1.7890625, "grad_norm_var": 0.014086659749348958, "learning_rate": 0.0001, "loss": 3.9105, "loss/crossentropy": 1.9231160879135132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17756588757038116, "step": 20082 }, { "epoch": 0.40168, "grad_norm": 2.046875, "grad_norm_var": 0.01566136678059896, "learning_rate": 0.0001, "loss": 4.2187, "loss/crossentropy": 2.1724199056625366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21560527384281158, "step": 20084 }, { "epoch": 0.40172, "grad_norm": 2.125, "grad_norm_var": 0.017365519205729166, "learning_rate": 0.0001, "loss": 4.1919, "loss/crossentropy": 1.7942206859588623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1705407276749611, "step": 20086 }, { "epoch": 0.40176, "grad_norm": 1.8984375, "grad_norm_var": 0.017183430989583335, "learning_rate": 0.0001, "loss": 4.0294, "loss/crossentropy": 2.1846182346343994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2066875621676445, "step": 20088 }, { "epoch": 0.4018, "grad_norm": 1.828125, "grad_norm_var": 0.0183746337890625, "learning_rate": 0.0001, "loss": 3.6378, "loss/crossentropy": 1.9378133416175842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18584155291318893, "step": 20090 }, { "epoch": 0.40184, "grad_norm": 1.984375, "grad_norm_var": 0.018379720052083333, "learning_rate": 0.0001, "loss": 4.2124, "loss/crossentropy": 1.9822070598602295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22574876248836517, "step": 20092 }, { "epoch": 0.40188, "grad_norm": 1.90625, "grad_norm_var": 0.009626261393229167, "learning_rate": 0.0001, "loss": 3.8348, "loss/crossentropy": 2.133267104625702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20023657381534576, "step": 20094 }, { "epoch": 0.40192, "grad_norm": 1.8984375, "grad_norm_var": 0.012465159098307291, "learning_rate": 0.0001, "loss": 3.6025, "loss/crossentropy": 1.9899320602416992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17226950079202652, "step": 20096 }, { "epoch": 0.40196, "grad_norm": 2.625, "grad_norm_var": 0.042699940999348956, "learning_rate": 0.0001, "loss": 4.6337, "loss/crossentropy": 2.3320037126541138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.33343885838985443, "step": 20098 }, { "epoch": 0.402, "grad_norm": 2.015625, "grad_norm_var": 0.04210586547851562, "learning_rate": 0.0001, "loss": 4.1064, "loss/crossentropy": 2.2026573419570923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20648924261331558, "step": 20100 }, { "epoch": 0.40204, "grad_norm": 1.859375, "grad_norm_var": 0.04067789713541667, "learning_rate": 0.0001, "loss": 4.0363, "loss/crossentropy": 2.1894484758377075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947491317987442, "step": 20102 }, { "epoch": 0.40208, "grad_norm": 1.8984375, "grad_norm_var": 0.040421549479166666, "learning_rate": 0.0001, "loss": 4.1506, "loss/crossentropy": 2.102599799633026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19991370290517807, "step": 20104 }, { "epoch": 0.40212, "grad_norm": 1.8984375, "grad_norm_var": 0.038590494791666666, "learning_rate": 0.0001, "loss": 4.0476, "loss/crossentropy": 2.0289117097854614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18713568150997162, "step": 20106 }, { "epoch": 0.40216, "grad_norm": 1.859375, "grad_norm_var": 0.039098866780598956, "learning_rate": 0.0001, "loss": 4.0656, "loss/crossentropy": 2.0425156950950623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19697192311286926, "step": 20108 }, { "epoch": 0.4022, "grad_norm": 1.8125, "grad_norm_var": 0.0391265869140625, "learning_rate": 0.0001, "loss": 3.7796, "loss/crossentropy": 1.9846921563148499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20306336879730225, "step": 20110 }, { "epoch": 0.40224, "grad_norm": 1.859375, "grad_norm_var": 0.035166168212890626, "learning_rate": 0.0001, "loss": 3.8168, "loss/crossentropy": 1.846974492073059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19833409041166306, "step": 20112 }, { "epoch": 0.40228, "grad_norm": 1.9453125, "grad_norm_var": 0.005052693684895833, "learning_rate": 0.0001, "loss": 4.0727, "loss/crossentropy": 2.3157109022140503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169334515929222, "step": 20114 }, { "epoch": 0.40232, "grad_norm": 1.9375, "grad_norm_var": 0.004349772135416667, "learning_rate": 0.0001, "loss": 4.0974, "loss/crossentropy": 2.28099262714386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20791316777467728, "step": 20116 }, { "epoch": 0.40236, "grad_norm": 1.9609375, "grad_norm_var": 0.0042073567708333336, "learning_rate": 0.0001, "loss": 4.0327, "loss/crossentropy": 2.3558366298675537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2205863893032074, "step": 20118 }, { "epoch": 0.4024, "grad_norm": 1.984375, "grad_norm_var": 0.0045074462890625, "learning_rate": 0.0001, "loss": 4.2792, "loss/crossentropy": 2.0106826424598694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982533410191536, "step": 20120 }, { "epoch": 0.40244, "grad_norm": 1.9765625, "grad_norm_var": 0.005773671468098958, "learning_rate": 0.0001, "loss": 4.2738, "loss/crossentropy": 2.321291446685791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20979785174131393, "step": 20122 }, { "epoch": 0.40248, "grad_norm": 2.375, "grad_norm_var": 0.016658528645833334, "learning_rate": 0.0001, "loss": 4.3524, "loss/crossentropy": 2.3632254600524902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2184157818555832, "step": 20124 }, { "epoch": 0.40252, "grad_norm": 2.078125, "grad_norm_var": 0.015144856770833333, "learning_rate": 0.0001, "loss": 3.959, "loss/crossentropy": 2.1488123536109924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2031005695462227, "step": 20126 }, { "epoch": 0.40256, "grad_norm": 2.203125, "grad_norm_var": 0.015533192952473959, "learning_rate": 0.0001, "loss": 4.3341, "loss/crossentropy": 1.8036405444145203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19172652810811996, "step": 20128 }, { "epoch": 0.4026, "grad_norm": 1.984375, "grad_norm_var": 0.014925130208333333, "learning_rate": 0.0001, "loss": 4.1131, "loss/crossentropy": 2.0299129486083984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20876885205507278, "step": 20130 }, { "epoch": 0.40264, "grad_norm": 2.0, "grad_norm_var": 0.014399973551432292, "learning_rate": 0.0001, "loss": 3.9618, "loss/crossentropy": 1.9177062511444092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19626032561063766, "step": 20132 }, { "epoch": 0.40268, "grad_norm": 1.859375, "grad_norm_var": 0.015897623697916665, "learning_rate": 0.0001, "loss": 4.1608, "loss/crossentropy": 2.2073251008987427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061092108488083, "step": 20134 }, { "epoch": 0.40272, "grad_norm": 1.9296875, "grad_norm_var": 0.01739476521809896, "learning_rate": 0.0001, "loss": 3.7592, "loss/crossentropy": 2.0243424773216248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2109317108988762, "step": 20136 }, { "epoch": 0.40276, "grad_norm": 1.8515625, "grad_norm_var": 0.02050959269205729, "learning_rate": 0.0001, "loss": 3.845, "loss/crossentropy": 2.0052929520606995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17854592204093933, "step": 20138 }, { "epoch": 0.4028, "grad_norm": 1.9140625, "grad_norm_var": 0.010790761311848958, "learning_rate": 0.0001, "loss": 4.0656, "loss/crossentropy": 2.067071557044983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18235646188259125, "step": 20140 }, { "epoch": 0.40284, "grad_norm": 1.9453125, "grad_norm_var": 0.009354400634765624, "learning_rate": 0.0001, "loss": 4.0343, "loss/crossentropy": 2.050750732421875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19028235971927643, "step": 20142 }, { "epoch": 0.40288, "grad_norm": 2.109375, "grad_norm_var": 0.005680084228515625, "learning_rate": 0.0001, "loss": 4.2734, "loss/crossentropy": 2.1326886415481567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158709168434143, "step": 20144 }, { "epoch": 0.40292, "grad_norm": 1.8359375, "grad_norm_var": 0.006571451822916667, "learning_rate": 0.0001, "loss": 4.1029, "loss/crossentropy": 1.9970600605010986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20081285387277603, "step": 20146 }, { "epoch": 0.40296, "grad_norm": 1.9453125, "grad_norm_var": 0.006178538004557292, "learning_rate": 0.0001, "loss": 3.8243, "loss/crossentropy": 2.042023479938507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022870033979416, "step": 20148 }, { "epoch": 0.403, "grad_norm": 1.921875, "grad_norm_var": 0.006349436442057292, "learning_rate": 0.0001, "loss": 4.3217, "loss/crossentropy": 2.3379902839660645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2027904912829399, "step": 20150 }, { "epoch": 0.40304, "grad_norm": 2.03125, "grad_norm_var": 0.007080078125, "learning_rate": 0.0001, "loss": 4.0321, "loss/crossentropy": 1.804058849811554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18429075926542282, "step": 20152 }, { "epoch": 0.40308, "grad_norm": 1.984375, "grad_norm_var": 0.005716705322265625, "learning_rate": 0.0001, "loss": 4.1405, "loss/crossentropy": 1.93051016330719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1912556290626526, "step": 20154 }, { "epoch": 0.40312, "grad_norm": 1.890625, "grad_norm_var": 0.004634348551432291, "learning_rate": 0.0001, "loss": 4.0571, "loss/crossentropy": 2.3044906854629517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22676818072795868, "step": 20156 }, { "epoch": 0.40316, "grad_norm": 2.0, "grad_norm_var": 0.0047686258951822914, "learning_rate": 0.0001, "loss": 4.1262, "loss/crossentropy": 2.1352078914642334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19985031336545944, "step": 20158 }, { "epoch": 0.4032, "grad_norm": 1.9453125, "grad_norm_var": 0.0031565348307291668, "learning_rate": 0.0001, "loss": 4.1973, "loss/crossentropy": 2.0967469811439514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1917915642261505, "step": 20160 }, { "epoch": 0.40324, "grad_norm": 1.875, "grad_norm_var": 0.0033078511555989583, "learning_rate": 0.0001, "loss": 4.2332, "loss/crossentropy": 2.1989063024520874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067013531923294, "step": 20162 }, { "epoch": 0.40328, "grad_norm": 1.9765625, "grad_norm_var": 0.0032623291015625, "learning_rate": 0.0001, "loss": 4.144, "loss/crossentropy": 2.047918140888214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19593901932239532, "step": 20164 }, { "epoch": 0.40332, "grad_norm": 2.046875, "grad_norm_var": 0.003824615478515625, "learning_rate": 0.0001, "loss": 4.1674, "loss/crossentropy": 2.258637487888336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20108382403850555, "step": 20166 }, { "epoch": 0.40336, "grad_norm": 2.03125, "grad_norm_var": 0.003574371337890625, "learning_rate": 0.0001, "loss": 4.3006, "loss/crossentropy": 2.4297776222229004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21948332339525223, "step": 20168 }, { "epoch": 0.4034, "grad_norm": 2.046875, "grad_norm_var": 0.007413482666015625, "learning_rate": 0.0001, "loss": 4.1809, "loss/crossentropy": 2.029311180114746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19954024255275726, "step": 20170 }, { "epoch": 0.40344, "grad_norm": 1.8671875, "grad_norm_var": 0.008123524983723958, "learning_rate": 0.0001, "loss": 4.0891, "loss/crossentropy": 2.099509835243225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19811799377202988, "step": 20172 }, { "epoch": 0.40348, "grad_norm": 1.8359375, "grad_norm_var": 0.009732818603515625, "learning_rate": 0.0001, "loss": 3.996, "loss/crossentropy": 1.9699227809906006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16898275911808014, "step": 20174 }, { "epoch": 0.40352, "grad_norm": 1.8515625, "grad_norm_var": 0.010668690999348958, "learning_rate": 0.0001, "loss": 4.0507, "loss/crossentropy": 2.0661654472351074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20182892680168152, "step": 20176 }, { "epoch": 0.40356, "grad_norm": 1.859375, "grad_norm_var": 0.0108306884765625, "learning_rate": 0.0001, "loss": 4.0142, "loss/crossentropy": 2.0732903480529785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1912563294172287, "step": 20178 }, { "epoch": 0.4036, "grad_norm": 1.96875, "grad_norm_var": 0.010578409830729166, "learning_rate": 0.0001, "loss": 4.0337, "loss/crossentropy": 1.9951130747795105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17538709938526154, "step": 20180 }, { "epoch": 0.40364, "grad_norm": 1.921875, "grad_norm_var": 0.010117340087890624, "learning_rate": 0.0001, "loss": 3.994, "loss/crossentropy": 1.8298532366752625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18598610162734985, "step": 20182 }, { "epoch": 0.40368, "grad_norm": 2.5, "grad_norm_var": 0.028742472330729168, "learning_rate": 0.0001, "loss": 4.3272, "loss/crossentropy": 2.19102144241333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25994810461997986, "step": 20184 }, { "epoch": 0.40372, "grad_norm": 2.03125, "grad_norm_var": 0.02552490234375, "learning_rate": 0.0001, "loss": 4.1003, "loss/crossentropy": 2.021101176738739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19094721227884293, "step": 20186 }, { "epoch": 0.40376, "grad_norm": 1.96875, "grad_norm_var": 0.02437108357747396, "learning_rate": 0.0001, "loss": 4.0057, "loss/crossentropy": 1.9469704627990723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19880877435207367, "step": 20188 }, { "epoch": 0.4038, "grad_norm": 1.9375, "grad_norm_var": 0.023374176025390624, "learning_rate": 0.0001, "loss": 4.1993, "loss/crossentropy": 2.0968725085258484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23307666182518005, "step": 20190 }, { "epoch": 0.40384, "grad_norm": 1.828125, "grad_norm_var": 0.0236572265625, "learning_rate": 0.0001, "loss": 4.1409, "loss/crossentropy": 1.7614120244979858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854785978794098, "step": 20192 }, { "epoch": 0.40388, "grad_norm": 1.875, "grad_norm_var": 0.021714019775390624, "learning_rate": 0.0001, "loss": 3.9778, "loss/crossentropy": 2.1977179050445557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19997768104076385, "step": 20194 }, { "epoch": 0.40392, "grad_norm": 1.984375, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 3.6169, "loss/crossentropy": 1.6404736638069153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18520022183656693, "step": 20196 }, { "epoch": 0.40396, "grad_norm": 1.9375, "grad_norm_var": 0.022345987955729167, "learning_rate": 0.0001, "loss": 4.3767, "loss/crossentropy": 2.4365806579589844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23998292535543442, "step": 20198 }, { "epoch": 0.404, "grad_norm": 1.84375, "grad_norm_var": 0.005098215738932292, "learning_rate": 0.0001, "loss": 4.026, "loss/crossentropy": 2.191064238548279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20327729731798172, "step": 20200 }, { "epoch": 0.40404, "grad_norm": 1.875, "grad_norm_var": 0.004992421468098958, "learning_rate": 0.0001, "loss": 3.9392, "loss/crossentropy": 2.1551318764686584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.226949080824852, "step": 20202 }, { "epoch": 0.40408, "grad_norm": 2.390625, "grad_norm_var": 0.017463175455729167, "learning_rate": 0.0001, "loss": 4.4831, "loss/crossentropy": 2.4735565185546875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20860131084918976, "step": 20204 }, { "epoch": 0.40412, "grad_norm": 2.046875, "grad_norm_var": 0.017288970947265624, "learning_rate": 0.0001, "loss": 4.1105, "loss/crossentropy": 2.1407764554023743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044227048754692, "step": 20206 }, { "epoch": 0.40416, "grad_norm": 2.046875, "grad_norm_var": 0.016471099853515626, "learning_rate": 0.0001, "loss": 4.3293, "loss/crossentropy": 2.1775436401367188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21017960458993912, "step": 20208 }, { "epoch": 0.4042, "grad_norm": 2.015625, "grad_norm_var": 0.018195597330729167, "learning_rate": 0.0001, "loss": 3.9514, "loss/crossentropy": 1.8976858854293823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17403876036405563, "step": 20210 }, { "epoch": 0.40424, "grad_norm": 2.046875, "grad_norm_var": 0.018143463134765624, "learning_rate": 0.0001, "loss": 3.7933, "loss/crossentropy": 1.7951022386550903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805388182401657, "step": 20212 }, { "epoch": 0.40428, "grad_norm": 1.984375, "grad_norm_var": 0.01911188761393229, "learning_rate": 0.0001, "loss": 4.0215, "loss/crossentropy": 2.0954891443252563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20514150708913803, "step": 20214 }, { "epoch": 0.40432, "grad_norm": 2.046875, "grad_norm_var": 0.018212636311848957, "learning_rate": 0.0001, "loss": 4.1753, "loss/crossentropy": 2.125544309616089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19341818243265152, "step": 20216 }, { "epoch": 0.40436, "grad_norm": 2.125, "grad_norm_var": 0.01802546183268229, "learning_rate": 0.0001, "loss": 4.1361, "loss/crossentropy": 1.7649177312850952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18554037809371948, "step": 20218 }, { "epoch": 0.4044, "grad_norm": 2.0, "grad_norm_var": 0.007529449462890625, "learning_rate": 0.0001, "loss": 4.1548, "loss/crossentropy": 2.1141446232795715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219147227704525, "step": 20220 }, { "epoch": 0.40444, "grad_norm": 1.9453125, "grad_norm_var": 0.007328033447265625, "learning_rate": 0.0001, "loss": 3.803, "loss/crossentropy": 1.9603995084762573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19251137226819992, "step": 20222 }, { "epoch": 0.40448, "grad_norm": 1.9765625, "grad_norm_var": 0.006941731770833333, "learning_rate": 0.0001, "loss": 4.1023, "loss/crossentropy": 2.1824519634246826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134433463215828, "step": 20224 }, { "epoch": 0.40452, "grad_norm": 1.84375, "grad_norm_var": 0.005796051025390625, "learning_rate": 0.0001, "loss": 4.0501, "loss/crossentropy": 2.0491157174110413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19854432344436646, "step": 20226 }, { "epoch": 0.40456, "grad_norm": 1.828125, "grad_norm_var": 0.007108561197916667, "learning_rate": 0.0001, "loss": 3.9262, "loss/crossentropy": 2.3731456995010376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2264491394162178, "step": 20228 }, { "epoch": 0.4046, "grad_norm": 1.859375, "grad_norm_var": 0.006150054931640625, "learning_rate": 0.0001, "loss": 3.9746, "loss/crossentropy": 1.9901636242866516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17958952486515045, "step": 20230 }, { "epoch": 0.40464, "grad_norm": 2.09375, "grad_norm_var": 0.0072265625, "learning_rate": 0.0001, "loss": 4.2473, "loss/crossentropy": 1.9541369080543518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19508858770132065, "step": 20232 }, { "epoch": 0.40468, "grad_norm": 1.9296875, "grad_norm_var": 0.0056149800618489586, "learning_rate": 0.0001, "loss": 3.8702, "loss/crossentropy": 1.9723637700080872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17709529399871826, "step": 20234 }, { "epoch": 0.40472, "grad_norm": 2.046875, "grad_norm_var": 0.006669108072916667, "learning_rate": 0.0001, "loss": 4.1318, "loss/crossentropy": 2.2354401350021362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117457002401352, "step": 20236 }, { "epoch": 0.40476, "grad_norm": 1.90625, "grad_norm_var": 0.017787424723307292, "learning_rate": 0.0001, "loss": 4.1353, "loss/crossentropy": 2.1230952739715576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19268101453781128, "step": 20238 }, { "epoch": 0.4048, "grad_norm": 1.9140625, "grad_norm_var": 0.024057769775390626, "learning_rate": 0.0001, "loss": 4.1973, "loss/crossentropy": 2.250411033630371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24458947777748108, "step": 20240 }, { "epoch": 0.40484, "grad_norm": 1.9921875, "grad_norm_var": 0.021882120768229166, "learning_rate": 0.0001, "loss": 4.0774, "loss/crossentropy": 2.0546218752861023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20313245803117752, "step": 20242 }, { "epoch": 0.40488, "grad_norm": 1.9140625, "grad_norm_var": 0.020808664957682292, "learning_rate": 0.0001, "loss": 3.827, "loss/crossentropy": 1.8805654644966125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19259311258792877, "step": 20244 }, { "epoch": 0.40492, "grad_norm": 1.8828125, "grad_norm_var": 0.023209381103515624, "learning_rate": 0.0001, "loss": 3.7371, "loss/crossentropy": 1.9676281809806824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1743411347270012, "step": 20246 }, { "epoch": 0.40496, "grad_norm": 1.8984375, "grad_norm_var": 0.024894205729166667, "learning_rate": 0.0001, "loss": 3.7542, "loss/crossentropy": 2.2166742086410522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19619230180978775, "step": 20248 }, { "epoch": 0.405, "grad_norm": 1.828125, "grad_norm_var": 0.027103424072265625, "learning_rate": 0.0001, "loss": 4.2001, "loss/crossentropy": 2.2599674463272095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19451096653938293, "step": 20250 }, { "epoch": 0.40504, "grad_norm": 1.8828125, "grad_norm_var": 0.027296702067057293, "learning_rate": 0.0001, "loss": 3.7535, "loss/crossentropy": 2.206246018409729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20522233098745346, "step": 20252 }, { "epoch": 0.40508, "grad_norm": 2.171875, "grad_norm_var": 0.01790135701497396, "learning_rate": 0.0001, "loss": 4.0881, "loss/crossentropy": 2.0229114294052124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21543244272470474, "step": 20254 }, { "epoch": 0.40512, "grad_norm": 2.0, "grad_norm_var": 0.00960693359375, "learning_rate": 0.0001, "loss": 4.0279, "loss/crossentropy": 2.245228886604309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20587065815925598, "step": 20256 }, { "epoch": 0.40516, "grad_norm": 1.8828125, "grad_norm_var": 0.009368642171223959, "learning_rate": 0.0001, "loss": 3.934, "loss/crossentropy": 2.030737340450287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20048178732395172, "step": 20258 }, { "epoch": 0.4052, "grad_norm": 1.8203125, "grad_norm_var": 0.010027821858723958, "learning_rate": 0.0001, "loss": 4.0323, "loss/crossentropy": 2.084582805633545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19962488114833832, "step": 20260 }, { "epoch": 0.40524, "grad_norm": 1.9296875, "grad_norm_var": 0.009075673421223958, "learning_rate": 0.0001, "loss": 3.8496, "loss/crossentropy": 2.2461657524108887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21189837902784348, "step": 20262 }, { "epoch": 0.40528, "grad_norm": 2.03125, "grad_norm_var": 0.0095123291015625, "learning_rate": 0.0001, "loss": 4.0405, "loss/crossentropy": 1.9522897005081177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18683286011219025, "step": 20264 }, { "epoch": 0.40532, "grad_norm": 1.8359375, "grad_norm_var": 0.008104451497395833, "learning_rate": 0.0001, "loss": 3.9797, "loss/crossentropy": 1.9156713485717773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18374722450971603, "step": 20266 }, { "epoch": 0.40536, "grad_norm": 2.96875, "grad_norm_var": 0.07459208170572916, "learning_rate": 0.0001, "loss": 4.2083, "loss/crossentropy": 2.2467408180236816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2305024415254593, "step": 20268 }, { "epoch": 0.4054, "grad_norm": 1.921875, "grad_norm_var": 0.07486063639322917, "learning_rate": 0.0001, "loss": 3.9249, "loss/crossentropy": 2.0827468633651733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20297909528017044, "step": 20270 }, { "epoch": 0.40544, "grad_norm": 2.0625, "grad_norm_var": 0.07484130859375, "learning_rate": 0.0001, "loss": 4.1911, "loss/crossentropy": 2.1979604959487915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19396250694990158, "step": 20272 }, { "epoch": 0.40548, "grad_norm": 1.953125, "grad_norm_var": 0.0729156494140625, "learning_rate": 0.0001, "loss": 4.0384, "loss/crossentropy": 2.0157305002212524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022785097360611, "step": 20274 }, { "epoch": 0.40552, "grad_norm": 2.109375, "grad_norm_var": 0.06889012654622396, "learning_rate": 0.0001, "loss": 4.1509, "loss/crossentropy": 1.838208019733429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19091250747442245, "step": 20276 }, { "epoch": 0.40556, "grad_norm": 1.8828125, "grad_norm_var": 0.06746800740559895, "learning_rate": 0.0001, "loss": 4.1206, "loss/crossentropy": 2.1762728691101074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19559895247220993, "step": 20278 }, { "epoch": 0.4056, "grad_norm": 2.03125, "grad_norm_var": 0.06892903645833333, "learning_rate": 0.0001, "loss": 4.2381, "loss/crossentropy": 2.2119935154914856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954266056418419, "step": 20280 }, { "epoch": 0.40564, "grad_norm": 2.0, "grad_norm_var": 0.0665728251139323, "learning_rate": 0.0001, "loss": 3.9296, "loss/crossentropy": 1.9655035138130188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19419729709625244, "step": 20282 }, { "epoch": 0.40568, "grad_norm": 1.8828125, "grad_norm_var": 0.009501139322916666, "learning_rate": 0.0001, "loss": 3.8007, "loss/crossentropy": 1.7471181750297546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15437456965446472, "step": 20284 }, { "epoch": 0.40572, "grad_norm": 1.96875, "grad_norm_var": 0.006799062093098958, "learning_rate": 0.0001, "loss": 3.9379, "loss/crossentropy": 1.8075406551361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18546781688928604, "step": 20286 }, { "epoch": 0.40576, "grad_norm": 2.015625, "grad_norm_var": 0.006925455729166667, "learning_rate": 0.0001, "loss": 4.2841, "loss/crossentropy": 2.220236897468567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20516446232795715, "step": 20288 }, { "epoch": 0.4058, "grad_norm": 1.828125, "grad_norm_var": 0.006965128580729166, "learning_rate": 0.0001, "loss": 4.0263, "loss/crossentropy": 1.887015163898468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1859249323606491, "step": 20290 }, { "epoch": 0.40584, "grad_norm": 1.9375, "grad_norm_var": 0.004776763916015625, "learning_rate": 0.0001, "loss": 4.1448, "loss/crossentropy": 2.2080272436141968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20920713245868683, "step": 20292 }, { "epoch": 0.40588, "grad_norm": 2.109375, "grad_norm_var": 0.005722808837890625, "learning_rate": 0.0001, "loss": 4.2826, "loss/crossentropy": 1.9298865795135498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954272910952568, "step": 20294 }, { "epoch": 0.40592, "grad_norm": 1.984375, "grad_norm_var": 0.0064208984375, "learning_rate": 0.0001, "loss": 3.9118, "loss/crossentropy": 1.914223849773407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010154277086258, "step": 20296 }, { "epoch": 0.40596, "grad_norm": 1.9375, "grad_norm_var": 0.005782063802083333, "learning_rate": 0.0001, "loss": 3.7257, "loss/crossentropy": 1.7863758206367493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19244936853647232, "step": 20298 }, { "epoch": 0.406, "grad_norm": 1.8828125, "grad_norm_var": 0.005782063802083333, "learning_rate": 0.0001, "loss": 3.8874, "loss/crossentropy": 2.0487464666366577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20337236672639847, "step": 20300 }, { "epoch": 0.40604, "grad_norm": 2.0625, "grad_norm_var": 0.007281239827473958, "learning_rate": 0.0001, "loss": 3.8666, "loss/crossentropy": 1.8796595335006714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18166716396808624, "step": 20302 }, { "epoch": 0.40608, "grad_norm": 2.03125, "grad_norm_var": 0.0071441650390625, "learning_rate": 0.0001, "loss": 4.0397, "loss/crossentropy": 2.0738844871520996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20000334084033966, "step": 20304 }, { "epoch": 0.40612, "grad_norm": 1.953125, "grad_norm_var": 0.0070709228515625, "learning_rate": 0.0001, "loss": 3.8526, "loss/crossentropy": 2.392453908920288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19829116761684418, "step": 20306 }, { "epoch": 0.40616, "grad_norm": 2.03125, "grad_norm_var": 0.010117340087890624, "learning_rate": 0.0001, "loss": 4.0658, "loss/crossentropy": 2.137113928794861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19673822075128555, "step": 20308 }, { "epoch": 0.4062, "grad_norm": 1.9296875, "grad_norm_var": 0.008593495686848958, "learning_rate": 0.0001, "loss": 4.1409, "loss/crossentropy": 2.0520911812782288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19579464942216873, "step": 20310 }, { "epoch": 0.40624, "grad_norm": 1.8828125, "grad_norm_var": 0.007303873697916667, "learning_rate": 0.0001, "loss": 4.2045, "loss/crossentropy": 1.9204559922218323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19526654481887817, "step": 20312 }, { "epoch": 0.40628, "grad_norm": 1.96875, "grad_norm_var": 0.009822336832682292, "learning_rate": 0.0001, "loss": 3.9996, "loss/crossentropy": 2.0489402413368225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19304056465625763, "step": 20314 }, { "epoch": 0.40632, "grad_norm": 2.4375, "grad_norm_var": 10.828910319010417, "learning_rate": 0.0001, "loss": 5.057, "loss/crossentropy": 2.127828896045685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21591445803642273, "step": 20316 }, { "epoch": 0.40636, "grad_norm": 1.9453125, "grad_norm_var": 10.844437408447266, "learning_rate": 0.0001, "loss": 4.0046, "loss/crossentropy": 1.9867295026779175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19691205769777298, "step": 20318 }, { "epoch": 0.4064, "grad_norm": 2.0, "grad_norm_var": 10.848514811197917, "learning_rate": 0.0001, "loss": 4.1028, "loss/crossentropy": 2.324278950691223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2344008907675743, "step": 20320 }, { "epoch": 0.40644, "grad_norm": 2.15625, "grad_norm_var": 10.786188761393229, "learning_rate": 0.0001, "loss": 4.3306, "loss/crossentropy": 2.2904698848724365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21537532657384872, "step": 20322 }, { "epoch": 0.40648, "grad_norm": 1.96875, "grad_norm_var": 10.77518081665039, "learning_rate": 0.0001, "loss": 3.9667, "loss/crossentropy": 2.0894381999969482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20012424886226654, "step": 20324 }, { "epoch": 0.40652, "grad_norm": 1.984375, "grad_norm_var": 10.77937723795573, "learning_rate": 0.0001, "loss": 4.0753, "loss/crossentropy": 2.109618663787842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21001552045345306, "step": 20326 }, { "epoch": 0.40656, "grad_norm": 2.078125, "grad_norm_var": 10.745448557535807, "learning_rate": 0.0001, "loss": 4.1139, "loss/crossentropy": 2.048314094543457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19623054563999176, "step": 20328 }, { "epoch": 0.4066, "grad_norm": 2.09375, "grad_norm_var": 10.683837636311848, "learning_rate": 0.0001, "loss": 4.3341, "loss/crossentropy": 2.2704076766967773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24112576246261597, "step": 20330 }, { "epoch": 0.40664, "grad_norm": 2.015625, "grad_norm_var": 0.010057576497395833, "learning_rate": 0.0001, "loss": 3.9603, "loss/crossentropy": 2.1316241025924683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20026037096977234, "step": 20332 }, { "epoch": 0.40668, "grad_norm": 2.34375, "grad_norm_var": 0.015740712483723957, "learning_rate": 0.0001, "loss": 4.2839, "loss/crossentropy": 2.1923957467079163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25954584032297134, "step": 20334 }, { "epoch": 0.40672, "grad_norm": 2.015625, "grad_norm_var": 0.01502685546875, "learning_rate": 0.0001, "loss": 4.0627, "loss/crossentropy": 2.228965699672699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21487966179847717, "step": 20336 }, { "epoch": 0.40676, "grad_norm": 1.8125, "grad_norm_var": 0.01531982421875, "learning_rate": 0.0001, "loss": 3.8906, "loss/crossentropy": 1.9097226858139038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1784234344959259, "step": 20338 }, { "epoch": 0.4068, "grad_norm": 1.8046875, "grad_norm_var": 0.020646158854166666, "learning_rate": 0.0001, "loss": 4.1256, "loss/crossentropy": 2.125716209411621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19827620685100555, "step": 20340 }, { "epoch": 0.40684, "grad_norm": 1.90625, "grad_norm_var": 0.02122802734375, "learning_rate": 0.0001, "loss": 4.1436, "loss/crossentropy": 2.025933086872101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20766756683588028, "step": 20342 }, { "epoch": 0.40688, "grad_norm": 1.9453125, "grad_norm_var": 0.022638956705729168, "learning_rate": 0.0001, "loss": 3.7289, "loss/crossentropy": 2.0030741095542908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19484290480613708, "step": 20344 }, { "epoch": 0.40692, "grad_norm": 1.9765625, "grad_norm_var": 0.02196019490559896, "learning_rate": 0.0001, "loss": 4.2287, "loss/crossentropy": 2.402723550796509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21893122792243958, "step": 20346 }, { "epoch": 0.40696, "grad_norm": 1.9140625, "grad_norm_var": 0.0221343994140625, "learning_rate": 0.0001, "loss": 3.8234, "loss/crossentropy": 2.077975869178772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20634907484054565, "step": 20348 }, { "epoch": 0.407, "grad_norm": 1.9765625, "grad_norm_var": 0.015306599934895833, "learning_rate": 0.0001, "loss": 3.7596, "loss/crossentropy": 1.86312997341156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18826880306005478, "step": 20350 }, { "epoch": 0.40704, "grad_norm": 1.9765625, "grad_norm_var": 0.015225982666015625, "learning_rate": 0.0001, "loss": 3.982, "loss/crossentropy": 2.090232253074646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20021233707666397, "step": 20352 }, { "epoch": 0.40708, "grad_norm": 1.7890625, "grad_norm_var": 0.015851847330729165, "learning_rate": 0.0001, "loss": 3.9013, "loss/crossentropy": 2.056272864341736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975172832608223, "step": 20354 }, { "epoch": 0.40712, "grad_norm": 2.03125, "grad_norm_var": 0.008689117431640626, "learning_rate": 0.0001, "loss": 4.2957, "loss/crossentropy": 2.0955827832221985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951812580227852, "step": 20356 }, { "epoch": 0.40716, "grad_norm": 1.9609375, "grad_norm_var": 0.007950846354166667, "learning_rate": 0.0001, "loss": 4.0598, "loss/crossentropy": 1.6103880405426025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17600858211517334, "step": 20358 }, { "epoch": 0.4072, "grad_norm": 1.9375, "grad_norm_var": 0.00782470703125, "learning_rate": 0.0001, "loss": 4.1568, "loss/crossentropy": 2.2216384410858154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22146118432283401, "step": 20360 }, { "epoch": 0.40724, "grad_norm": 2.0625, "grad_norm_var": 0.007033030192057292, "learning_rate": 0.0001, "loss": 4.3917, "loss/crossentropy": 2.1781840324401855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19006534665822983, "step": 20362 }, { "epoch": 0.40728, "grad_norm": 1.9453125, "grad_norm_var": 0.0074371337890625, "learning_rate": 0.0001, "loss": 3.9049, "loss/crossentropy": 1.8517277240753174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17870192229747772, "step": 20364 }, { "epoch": 0.40732, "grad_norm": 1.984375, "grad_norm_var": 0.006105295817057292, "learning_rate": 0.0001, "loss": 4.1469, "loss/crossentropy": 2.0296677947044373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20260153710842133, "step": 20366 }, { "epoch": 0.40736, "grad_norm": 1.8671875, "grad_norm_var": 0.006046295166015625, "learning_rate": 0.0001, "loss": 4.0516, "loss/crossentropy": 2.088002324104309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1896708756685257, "step": 20368 }, { "epoch": 0.4074, "grad_norm": 1.8203125, "grad_norm_var": 0.005122629801432291, "learning_rate": 0.0001, "loss": 4.0888, "loss/crossentropy": 2.0305171608924866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1964031085371971, "step": 20370 }, { "epoch": 0.40744, "grad_norm": 1.9921875, "grad_norm_var": 0.0046770731608072914, "learning_rate": 0.0001, "loss": 4.0663, "loss/crossentropy": 1.9005139470100403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981399655342102, "step": 20372 }, { "epoch": 0.40748, "grad_norm": 1.9609375, "grad_norm_var": 0.0036333719889322918, "learning_rate": 0.0001, "loss": 4.0646, "loss/crossentropy": 2.1551105976104736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21034018695354462, "step": 20374 }, { "epoch": 0.40752, "grad_norm": 1.875, "grad_norm_var": 0.00372314453125, "learning_rate": 0.0001, "loss": 3.9726, "loss/crossentropy": 2.0319311022758484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1935891956090927, "step": 20376 }, { "epoch": 0.40756, "grad_norm": 2.09375, "grad_norm_var": 0.0052530924479166664, "learning_rate": 0.0001, "loss": 4.028, "loss/crossentropy": 2.06658136844635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19683608412742615, "step": 20378 }, { "epoch": 0.4076, "grad_norm": 2.328125, "grad_norm_var": 0.01574885050455729, "learning_rate": 0.0001, "loss": 4.5496, "loss/crossentropy": 2.667450428009033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24288231134414673, "step": 20380 }, { "epoch": 0.40764, "grad_norm": 1.7734375, "grad_norm_var": 0.021061197916666666, "learning_rate": 0.0001, "loss": 3.6242, "loss/crossentropy": 1.9372497200965881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18491743505001068, "step": 20382 }, { "epoch": 0.40768, "grad_norm": 1.8359375, "grad_norm_var": 0.021512858072916665, "learning_rate": 0.0001, "loss": 3.9358, "loss/crossentropy": 1.96478271484375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18305277079343796, "step": 20384 }, { "epoch": 0.40772, "grad_norm": 1.984375, "grad_norm_var": 0.020096842447916666, "learning_rate": 0.0001, "loss": 4.0033, "loss/crossentropy": 1.813466727733612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827232465147972, "step": 20386 }, { "epoch": 0.40776, "grad_norm": 1.8671875, "grad_norm_var": 0.020420074462890625, "learning_rate": 0.0001, "loss": 3.9655, "loss/crossentropy": 1.6925603151321411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17452678084373474, "step": 20388 }, { "epoch": 0.4078, "grad_norm": 2.046875, "grad_norm_var": 0.02113622029622396, "learning_rate": 0.0001, "loss": 4.0845, "loss/crossentropy": 1.8859028220176697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187887504696846, "step": 20390 }, { "epoch": 0.40784, "grad_norm": 2.046875, "grad_norm_var": 0.02066218058268229, "learning_rate": 0.0001, "loss": 3.9992, "loss/crossentropy": 2.0689834356307983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20971451699733734, "step": 20392 }, { "epoch": 0.40788, "grad_norm": 1.9296875, "grad_norm_var": 0.0203033447265625, "learning_rate": 0.0001, "loss": 4.2332, "loss/crossentropy": 2.237601161003113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20401395112276077, "step": 20394 }, { "epoch": 0.40792, "grad_norm": 1.859375, "grad_norm_var": 0.008560943603515624, "learning_rate": 0.0001, "loss": 3.9913, "loss/crossentropy": 1.8701192736625671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192036435008049, "step": 20396 }, { "epoch": 0.40796, "grad_norm": 1.890625, "grad_norm_var": 0.005680084228515625, "learning_rate": 0.0001, "loss": 4.2942, "loss/crossentropy": 2.3522990942001343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2178252711892128, "step": 20398 }, { "epoch": 0.408, "grad_norm": 2.03125, "grad_norm_var": 0.005069732666015625, "learning_rate": 0.0001, "loss": 4.245, "loss/crossentropy": 2.267996072769165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22347165644168854, "step": 20400 }, { "epoch": 0.40804, "grad_norm": 1.9765625, "grad_norm_var": 0.005208333333333333, "learning_rate": 0.0001, "loss": 4.079, "loss/crossentropy": 2.274789035320282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2143096998333931, "step": 20402 }, { "epoch": 0.40808, "grad_norm": 2.015625, "grad_norm_var": 0.004797108968098958, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 2.0331249237060547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19468723237514496, "step": 20404 }, { "epoch": 0.40812, "grad_norm": 1.875, "grad_norm_var": 0.004980214436848958, "learning_rate": 0.0001, "loss": 3.9698, "loss/crossentropy": 1.9442223906517029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19499845802783966, "step": 20406 }, { "epoch": 0.40816, "grad_norm": 1.96875, "grad_norm_var": 0.00697021484375, "learning_rate": 0.0001, "loss": 3.6943, "loss/crossentropy": 1.9122044444084167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19391249865293503, "step": 20408 }, { "epoch": 0.4082, "grad_norm": 1.890625, "grad_norm_var": 0.005651601155598958, "learning_rate": 0.0001, "loss": 3.9492, "loss/crossentropy": 1.9939789175987244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18136083334684372, "step": 20410 }, { "epoch": 0.40824, "grad_norm": 1.828125, "grad_norm_var": 0.006036122639973958, "learning_rate": 0.0001, "loss": 3.7378, "loss/crossentropy": 2.212642192840576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18931927531957626, "step": 20412 }, { "epoch": 0.40828, "grad_norm": 1.9921875, "grad_norm_var": 0.006990559895833333, "learning_rate": 0.0001, "loss": 4.1713, "loss/crossentropy": 1.8282644152641296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.190102256834507, "step": 20414 }, { "epoch": 0.40832, "grad_norm": 1.9765625, "grad_norm_var": 0.00745849609375, "learning_rate": 0.0001, "loss": 3.8334, "loss/crossentropy": 2.176365852355957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19906923174858093, "step": 20416 }, { "epoch": 0.40836, "grad_norm": 1.90625, "grad_norm_var": 0.0073626200358072914, "learning_rate": 0.0001, "loss": 4.1297, "loss/crossentropy": 2.1126151084899902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.180545412003994, "step": 20418 }, { "epoch": 0.4084, "grad_norm": 1.8984375, "grad_norm_var": 0.00784912109375, "learning_rate": 0.0001, "loss": 4.1878, "loss/crossentropy": 1.834926426410675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1794508993625641, "step": 20420 }, { "epoch": 0.40844, "grad_norm": 1.8671875, "grad_norm_var": 0.007746378580729167, "learning_rate": 0.0001, "loss": 4.0114, "loss/crossentropy": 2.2909014225006104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21252857148647308, "step": 20422 }, { "epoch": 0.40848, "grad_norm": 2.015625, "grad_norm_var": 0.005866495768229166, "learning_rate": 0.0001, "loss": 4.3771, "loss/crossentropy": 2.0966813564300537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197954960167408, "step": 20424 }, { "epoch": 0.40852, "grad_norm": 1.890625, "grad_norm_var": 0.005228424072265625, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 2.0520461201667786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19458714127540588, "step": 20426 }, { "epoch": 0.40856, "grad_norm": 2.046875, "grad_norm_var": 0.006400553385416666, "learning_rate": 0.0001, "loss": 4.4592, "loss/crossentropy": 2.2313653230667114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.227764293551445, "step": 20428 }, { "epoch": 0.4086, "grad_norm": 1.921875, "grad_norm_var": 0.006956990559895833, "learning_rate": 0.0001, "loss": 4.0398, "loss/crossentropy": 2.2874799966812134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21484653651714325, "step": 20430 }, { "epoch": 0.40864, "grad_norm": 1.9609375, "grad_norm_var": 0.006400553385416666, "learning_rate": 0.0001, "loss": 4.1991, "loss/crossentropy": 2.2390183806419373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21978556364774704, "step": 20432 }, { "epoch": 0.40868, "grad_norm": 1.8125, "grad_norm_var": 0.008742014567057291, "learning_rate": 0.0001, "loss": 4.0057, "loss/crossentropy": 2.2040212154388428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21448630839586258, "step": 20434 }, { "epoch": 0.40872, "grad_norm": 1.9296875, "grad_norm_var": 0.008356730143229166, "learning_rate": 0.0001, "loss": 4.1285, "loss/crossentropy": 2.2119942903518677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19829177856445312, "step": 20436 }, { "epoch": 0.40876, "grad_norm": 2.015625, "grad_norm_var": 0.007897694905598959, "learning_rate": 0.0001, "loss": 3.819, "loss/crossentropy": 1.769053339958191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17577596008777618, "step": 20438 }, { "epoch": 0.4088, "grad_norm": 1.8515625, "grad_norm_var": 0.009226226806640625, "learning_rate": 0.0001, "loss": 4.0063, "loss/crossentropy": 2.1607295274734497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20044966787099838, "step": 20440 }, { "epoch": 0.40884, "grad_norm": 1.8984375, "grad_norm_var": 0.009228515625, "learning_rate": 0.0001, "loss": 4.0617, "loss/crossentropy": 1.4168038368225098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18440894782543182, "step": 20442 }, { "epoch": 0.40888, "grad_norm": 2.0, "grad_norm_var": 0.005490875244140625, "learning_rate": 0.0001, "loss": 4.2437, "loss/crossentropy": 2.1797818541526794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22209269553422928, "step": 20444 }, { "epoch": 0.40892, "grad_norm": 1.9453125, "grad_norm_var": 0.005086008707682292, "learning_rate": 0.0001, "loss": 4.1425, "loss/crossentropy": 2.255184292793274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20536670833826065, "step": 20446 }, { "epoch": 0.40896, "grad_norm": 1.9296875, "grad_norm_var": 0.005191802978515625, "learning_rate": 0.0001, "loss": 4.1206, "loss/crossentropy": 1.9633015990257263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20300059020519257, "step": 20448 }, { "epoch": 0.409, "grad_norm": 1.8984375, "grad_norm_var": 0.002685292561848958, "learning_rate": 0.0001, "loss": 3.9409, "loss/crossentropy": 2.0943238735198975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932680904865265, "step": 20450 }, { "epoch": 0.40904, "grad_norm": 2.1875, "grad_norm_var": 0.0073321024576822914, "learning_rate": 0.0001, "loss": 4.3528, "loss/crossentropy": 2.3319740295410156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24838463962078094, "step": 20452 }, { "epoch": 0.40908, "grad_norm": 1.8828125, "grad_norm_var": 0.0081787109375, "learning_rate": 0.0001, "loss": 3.9134, "loss/crossentropy": 2.0098442435264587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19818133860826492, "step": 20454 }, { "epoch": 0.40912, "grad_norm": 2.40625, "grad_norm_var": 0.020344034830729166, "learning_rate": 0.0001, "loss": 4.048, "loss/crossentropy": 1.9752068519592285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20243104547262192, "step": 20456 }, { "epoch": 0.40916, "grad_norm": 2.015625, "grad_norm_var": 0.024006144205729166, "learning_rate": 0.0001, "loss": 4.2057, "loss/crossentropy": 2.2999367713928223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22019093483686447, "step": 20458 }, { "epoch": 0.4092, "grad_norm": 1.8828125, "grad_norm_var": 0.026395416259765624, "learning_rate": 0.0001, "loss": 3.9169, "loss/crossentropy": 1.770751178264618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787494421005249, "step": 20460 }, { "epoch": 0.40924, "grad_norm": 1.953125, "grad_norm_var": 0.026667277018229168, "learning_rate": 0.0001, "loss": 3.9317, "loss/crossentropy": 2.0412501096725464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063281610608101, "step": 20462 }, { "epoch": 0.40928, "grad_norm": 1.9140625, "grad_norm_var": 0.027733357747395833, "learning_rate": 0.0001, "loss": 3.9852, "loss/crossentropy": 2.3771307468414307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21928168088197708, "step": 20464 }, { "epoch": 0.40932, "grad_norm": 1.9765625, "grad_norm_var": 0.027551015218098957, "learning_rate": 0.0001, "loss": 4.0884, "loss/crossentropy": 1.91280996799469, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17027543485164642, "step": 20466 }, { "epoch": 0.40936, "grad_norm": 1.8671875, "grad_norm_var": 0.028831990559895833, "learning_rate": 0.0001, "loss": 3.9003, "loss/crossentropy": 2.024249494075775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20218047499656677, "step": 20468 }, { "epoch": 0.4094, "grad_norm": 2.28125, "grad_norm_var": 0.03514989217122396, "learning_rate": 0.0001, "loss": 4.0608, "loss/crossentropy": 1.787394940853119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18133901804685593, "step": 20470 }, { "epoch": 0.40944, "grad_norm": 1.8203125, "grad_norm_var": 0.02319920857747396, "learning_rate": 0.0001, "loss": 3.9609, "loss/crossentropy": 2.085790276527405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18643641471862793, "step": 20472 }, { "epoch": 0.40948, "grad_norm": 1.953125, "grad_norm_var": 0.017929840087890624, "learning_rate": 0.0001, "loss": 4.2251, "loss/crossentropy": 2.1670111417770386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19420959055423737, "step": 20474 }, { "epoch": 0.40952, "grad_norm": 1.875, "grad_norm_var": 0.01706720987955729, "learning_rate": 0.0001, "loss": 4.0588, "loss/crossentropy": 2.0304930210113525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18586499989032745, "step": 20476 }, { "epoch": 0.40956, "grad_norm": 2.0625, "grad_norm_var": 0.017832183837890626, "learning_rate": 0.0001, "loss": 4.2371, "loss/crossentropy": 2.0918440222740173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21831007301807404, "step": 20478 }, { "epoch": 0.4096, "grad_norm": 1.8671875, "grad_norm_var": 0.017765045166015625, "learning_rate": 0.0001, "loss": 3.8786, "loss/crossentropy": 2.027481257915497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19717323035001755, "step": 20480 }, { "epoch": 0.40964, "grad_norm": 1.890625, "grad_norm_var": 0.017333984375, "learning_rate": 0.0001, "loss": 4.3524, "loss/crossentropy": 2.227196455001831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21321603655815125, "step": 20482 }, { "epoch": 0.40968, "grad_norm": 2.078125, "grad_norm_var": 0.015329742431640625, "learning_rate": 0.0001, "loss": 3.9722, "loss/crossentropy": 1.8985567688941956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19271692633628845, "step": 20484 }, { "epoch": 0.40972, "grad_norm": 2.203125, "grad_norm_var": 0.011498006184895833, "learning_rate": 0.0001, "loss": 4.1364, "loss/crossentropy": 1.973772943019867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18885072320699692, "step": 20486 }, { "epoch": 0.40976, "grad_norm": 1.8828125, "grad_norm_var": 0.01171875, "learning_rate": 0.0001, "loss": 3.8451, "loss/crossentropy": 2.002311408519745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053375542163849, "step": 20488 }, { "epoch": 0.4098, "grad_norm": 2.09375, "grad_norm_var": 0.0702301025390625, "learning_rate": 0.0001, "loss": 3.8449, "loss/crossentropy": 1.7301526069641113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16717776656150818, "step": 20490 }, { "epoch": 0.40984, "grad_norm": 2.015625, "grad_norm_var": 0.0685546875, "learning_rate": 0.0001, "loss": 4.1871, "loss/crossentropy": 1.86936616897583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944454312324524, "step": 20492 }, { "epoch": 0.40988, "grad_norm": 1.9765625, "grad_norm_var": 0.06875381469726563, "learning_rate": 0.0001, "loss": 4.1013, "loss/crossentropy": 1.5966055393218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19996708631515503, "step": 20494 }, { "epoch": 0.40992, "grad_norm": 2.125, "grad_norm_var": 0.0671783447265625, "learning_rate": 0.0001, "loss": 3.9716, "loss/crossentropy": 1.8741289377212524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18652669340372086, "step": 20496 }, { "epoch": 0.40996, "grad_norm": 1.921875, "grad_norm_var": 0.06702372233072916, "learning_rate": 0.0001, "loss": 4.0857, "loss/crossentropy": 1.958758294582367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19158867746591568, "step": 20498 }, { "epoch": 0.41, "grad_norm": 1.953125, "grad_norm_var": 0.06591771443684896, "learning_rate": 0.0001, "loss": 3.7938, "loss/crossentropy": 1.8465643525123596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1774417608976364, "step": 20500 }, { "epoch": 0.41004, "grad_norm": 2.015625, "grad_norm_var": 0.06303609212239583, "learning_rate": 0.0001, "loss": 4.0265, "loss/crossentropy": 1.9578893780708313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19635465741157532, "step": 20502 }, { "epoch": 0.41008, "grad_norm": 1.875, "grad_norm_var": 0.06118977864583333, "learning_rate": 0.0001, "loss": 3.7317, "loss/crossentropy": 1.8893834352493286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1778636872768402, "step": 20504 }, { "epoch": 0.41012, "grad_norm": 1.8125, "grad_norm_var": 0.011189778645833334, "learning_rate": 0.0001, "loss": 4.1129, "loss/crossentropy": 2.2161590456962585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19613313674926758, "step": 20506 }, { "epoch": 0.41016, "grad_norm": 1.9453125, "grad_norm_var": 0.011368560791015624, "learning_rate": 0.0001, "loss": 4.2363, "loss/crossentropy": 2.1272462606430054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20562685281038284, "step": 20508 }, { "epoch": 0.4102, "grad_norm": 1.9921875, "grad_norm_var": 0.011319732666015625, "learning_rate": 0.0001, "loss": 3.9087, "loss/crossentropy": 1.8443754315376282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20733944326639175, "step": 20510 }, { "epoch": 0.41024, "grad_norm": 1.8984375, "grad_norm_var": 0.012007649739583333, "learning_rate": 0.0001, "loss": 3.8766, "loss/crossentropy": 1.8213927149772644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17855563014745712, "step": 20512 }, { "epoch": 0.41028, "grad_norm": 2.03125, "grad_norm_var": 0.009787750244140626, "learning_rate": 0.0001, "loss": 4.0914, "loss/crossentropy": 1.969380497932434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20201627910137177, "step": 20514 }, { "epoch": 0.41032, "grad_norm": 1.9765625, "grad_norm_var": 0.009479777018229166, "learning_rate": 0.0001, "loss": 4.0491, "loss/crossentropy": 1.8974932432174683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20217570662498474, "step": 20516 }, { "epoch": 0.41036, "grad_norm": 1.8359375, "grad_norm_var": 0.010312652587890625, "learning_rate": 0.0001, "loss": 4.0019, "loss/crossentropy": 1.9362964630126953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1832883059978485, "step": 20518 }, { "epoch": 0.4104, "grad_norm": 2.078125, "grad_norm_var": 0.010754140218098958, "learning_rate": 0.0001, "loss": 4.2071, "loss/crossentropy": 2.239969849586487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22399117052555084, "step": 20520 }, { "epoch": 0.41044, "grad_norm": 1.9921875, "grad_norm_var": 0.004987589518229167, "learning_rate": 0.0001, "loss": 3.9606, "loss/crossentropy": 1.927116334438324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.194569431245327, "step": 20522 }, { "epoch": 0.41048, "grad_norm": 1.9609375, "grad_norm_var": 0.00474853515625, "learning_rate": 0.0001, "loss": 4.0993, "loss/crossentropy": 1.921772539615631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19974206387996674, "step": 20524 }, { "epoch": 0.41052, "grad_norm": 2.046875, "grad_norm_var": 0.00509033203125, "learning_rate": 0.0001, "loss": 3.9591, "loss/crossentropy": 2.2263039350509644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088855803012848, "step": 20526 }, { "epoch": 0.41056, "grad_norm": 1.90625, "grad_norm_var": 0.00343017578125, "learning_rate": 0.0001, "loss": 3.9004, "loss/crossentropy": 1.9309766292572021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984400451183319, "step": 20528 }, { "epoch": 0.4106, "grad_norm": 1.9453125, "grad_norm_var": 0.0071441650390625, "learning_rate": 0.0001, "loss": 4.3564, "loss/crossentropy": 2.2890199422836304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210438072681427, "step": 20530 }, { "epoch": 0.41064, "grad_norm": 1.9921875, "grad_norm_var": 0.007199859619140625, "learning_rate": 0.0001, "loss": 4.2302, "loss/crossentropy": 1.961566150188446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20430031418800354, "step": 20532 }, { "epoch": 0.41068, "grad_norm": 2.03125, "grad_norm_var": 0.0051513671875, "learning_rate": 0.0001, "loss": 4.0714, "loss/crossentropy": 1.9776958227157593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20376411080360413, "step": 20534 }, { "epoch": 0.41072, "grad_norm": 1.9140625, "grad_norm_var": 0.005417633056640625, "learning_rate": 0.0001, "loss": 3.9352, "loss/crossentropy": 2.386608600616455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20900936424732208, "step": 20536 }, { "epoch": 0.41076, "grad_norm": 1.828125, "grad_norm_var": 0.007233683268229167, "learning_rate": 0.0001, "loss": 3.8075, "loss/crossentropy": 2.0592793226242065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19810590893030167, "step": 20538 }, { "epoch": 0.4108, "grad_norm": 1.9765625, "grad_norm_var": 0.007420857747395833, "learning_rate": 0.0001, "loss": 4.2639, "loss/crossentropy": 2.176212787628174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20612400770187378, "step": 20540 }, { "epoch": 0.41084, "grad_norm": 1.96875, "grad_norm_var": 0.007134755452473958, "learning_rate": 0.0001, "loss": 4.0386, "loss/crossentropy": 1.920596957206726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19120831042528152, "step": 20542 }, { "epoch": 0.41088, "grad_norm": 1.8046875, "grad_norm_var": 0.0086578369140625, "learning_rate": 0.0001, "loss": 4.0088, "loss/crossentropy": 1.8587758541107178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1729830950498581, "step": 20544 }, { "epoch": 0.41092, "grad_norm": 1.96875, "grad_norm_var": 0.004538726806640625, "learning_rate": 0.0001, "loss": 4.1263, "loss/crossentropy": 2.115676999092102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2130855768918991, "step": 20546 }, { "epoch": 0.41096, "grad_norm": 1.9609375, "grad_norm_var": 0.0043332417805989586, "learning_rate": 0.0001, "loss": 4.1933, "loss/crossentropy": 2.3977283239364624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074846476316452, "step": 20548 }, { "epoch": 0.411, "grad_norm": 1.7421875, "grad_norm_var": 0.005700429280598958, "learning_rate": 0.0001, "loss": 3.9028, "loss/crossentropy": 2.091398298740387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975005343556404, "step": 20550 }, { "epoch": 0.41104, "grad_norm": 1.9140625, "grad_norm_var": 0.005861155192057292, "learning_rate": 0.0001, "loss": 4.1509, "loss/crossentropy": 2.1864093542099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2201247364282608, "step": 20552 }, { "epoch": 0.41108, "grad_norm": 1.8984375, "grad_norm_var": 0.005206044514973958, "learning_rate": 0.0001, "loss": 3.8462, "loss/crossentropy": 1.9154713153839111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18139449506998062, "step": 20554 }, { "epoch": 0.41112, "grad_norm": 1.9296875, "grad_norm_var": 0.005682118733723958, "learning_rate": 0.0001, "loss": 4.0527, "loss/crossentropy": 2.392449378967285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069767862558365, "step": 20556 }, { "epoch": 0.41116, "grad_norm": 2.09375, "grad_norm_var": 0.0071604410807291664, "learning_rate": 0.0001, "loss": 4.0525, "loss/crossentropy": 1.9300146102905273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20329315215349197, "step": 20558 }, { "epoch": 0.4112, "grad_norm": 1.8359375, "grad_norm_var": 0.007313791910807292, "learning_rate": 0.0001, "loss": 3.6521, "loss/crossentropy": 1.9400085806846619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1855112686753273, "step": 20560 }, { "epoch": 0.41124, "grad_norm": 1.9765625, "grad_norm_var": 0.008321126302083334, "learning_rate": 0.0001, "loss": 4.1355, "loss/crossentropy": 2.1216121912002563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1966063678264618, "step": 20562 }, { "epoch": 0.41128, "grad_norm": 1.9453125, "grad_norm_var": 0.008154042561848958, "learning_rate": 0.0001, "loss": 3.9513, "loss/crossentropy": 2.136322498321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19735725224018097, "step": 20564 }, { "epoch": 0.41132, "grad_norm": 1.9453125, "grad_norm_var": 0.0060943603515625, "learning_rate": 0.0001, "loss": 3.9084, "loss/crossentropy": 2.1143118143081665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21534033119678497, "step": 20566 }, { "epoch": 0.41136, "grad_norm": 2.046875, "grad_norm_var": 0.006577301025390625, "learning_rate": 0.0001, "loss": 3.9451, "loss/crossentropy": 1.8363113403320312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17482534050941467, "step": 20568 }, { "epoch": 0.4114, "grad_norm": 1.9453125, "grad_norm_var": 0.00645751953125, "learning_rate": 0.0001, "loss": 4.0373, "loss/crossentropy": 1.8840075135231018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19201497733592987, "step": 20570 }, { "epoch": 0.41144, "grad_norm": 2.0, "grad_norm_var": 0.006270090738932292, "learning_rate": 0.0001, "loss": 4.0909, "loss/crossentropy": 2.0200312733650208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2130940854549408, "step": 20572 }, { "epoch": 0.41148, "grad_norm": 1.9296875, "grad_norm_var": 0.005078125, "learning_rate": 0.0001, "loss": 3.8886, "loss/crossentropy": 2.136338949203491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22277235239744186, "step": 20574 }, { "epoch": 0.41152, "grad_norm": 2.078125, "grad_norm_var": 0.004168446858723958, "learning_rate": 0.0001, "loss": 4.2314, "loss/crossentropy": 2.2381343841552734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20784687995910645, "step": 20576 }, { "epoch": 0.41156, "grad_norm": 2.0, "grad_norm_var": 0.003780110677083333, "learning_rate": 0.0001, "loss": 4.0749, "loss/crossentropy": 2.224187970161438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20546535402536392, "step": 20578 }, { "epoch": 0.4116, "grad_norm": 1.875, "grad_norm_var": 0.004117838541666667, "learning_rate": 0.0001, "loss": 3.5968, "loss/crossentropy": 1.966045081615448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19925507158041, "step": 20580 }, { "epoch": 0.41164, "grad_norm": 1.984375, "grad_norm_var": 0.0036041259765625, "learning_rate": 0.0001, "loss": 3.6756, "loss/crossentropy": 1.7995998859405518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1794610470533371, "step": 20582 }, { "epoch": 0.41168, "grad_norm": 2.3125, "grad_norm_var": 0.011474609375, "learning_rate": 0.0001, "loss": 4.2112, "loss/crossentropy": 1.874330759048462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23802601546049118, "step": 20584 }, { "epoch": 0.41172, "grad_norm": 1.9921875, "grad_norm_var": 0.011582183837890624, "learning_rate": 0.0001, "loss": 3.9275, "loss/crossentropy": 2.1704328060150146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21792975068092346, "step": 20586 }, { "epoch": 0.41176, "grad_norm": 2.046875, "grad_norm_var": 0.012648264567057291, "learning_rate": 0.0001, "loss": 4.0073, "loss/crossentropy": 1.9361745119094849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19227440655231476, "step": 20588 }, { "epoch": 0.4118, "grad_norm": 2.0625, "grad_norm_var": 0.0129058837890625, "learning_rate": 0.0001, "loss": 4.297, "loss/crossentropy": 2.19529128074646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23097511380910873, "step": 20590 }, { "epoch": 0.41184, "grad_norm": 1.9609375, "grad_norm_var": 0.012621815999348958, "learning_rate": 0.0001, "loss": 4.1085, "loss/crossentropy": 2.0319225788116455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19555176049470901, "step": 20592 }, { "epoch": 0.41188, "grad_norm": 1.890625, "grad_norm_var": 0.012393951416015625, "learning_rate": 0.0001, "loss": 4.0767, "loss/crossentropy": 2.395743250846863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20953691750764847, "step": 20594 }, { "epoch": 0.41192, "grad_norm": 1.796875, "grad_norm_var": 0.013901519775390624, "learning_rate": 0.0001, "loss": 3.8942, "loss/crossentropy": 1.7772584557533264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19968461245298386, "step": 20596 }, { "epoch": 0.41196, "grad_norm": 1.90625, "grad_norm_var": 0.0158843994140625, "learning_rate": 0.0001, "loss": 3.7776, "loss/crossentropy": 1.89765065908432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18085069954395294, "step": 20598 }, { "epoch": 0.412, "grad_norm": 1.796875, "grad_norm_var": 0.007846832275390625, "learning_rate": 0.0001, "loss": 3.9754, "loss/crossentropy": 2.1154285073280334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20126362890005112, "step": 20600 }, { "epoch": 0.41204, "grad_norm": 1.9609375, "grad_norm_var": 0.008259073893229166, "learning_rate": 0.0001, "loss": 4.2659, "loss/crossentropy": 2.15169358253479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20982082933187485, "step": 20602 }, { "epoch": 0.41208, "grad_norm": 1.8984375, "grad_norm_var": 0.007824452718098958, "learning_rate": 0.0001, "loss": 3.9542, "loss/crossentropy": 1.8561811447143555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18480198085308075, "step": 20604 }, { "epoch": 0.41212, "grad_norm": 1.96875, "grad_norm_var": 0.008766428629557291, "learning_rate": 0.0001, "loss": 3.8989, "loss/crossentropy": 1.8246020078659058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18010297417640686, "step": 20606 }, { "epoch": 0.41216, "grad_norm": 2.015625, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 4.1666, "loss/crossentropy": 2.3279634714126587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224753275513649, "step": 20608 }, { "epoch": 0.4122, "grad_norm": 2.0, "grad_norm_var": 0.009171295166015624, "learning_rate": 0.0001, "loss": 4.0695, "loss/crossentropy": 2.122196078300476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2060379832983017, "step": 20610 }, { "epoch": 0.41224, "grad_norm": 1.8515625, "grad_norm_var": 0.009666951497395833, "learning_rate": 0.0001, "loss": 4.1282, "loss/crossentropy": 2.0729124546051025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20537251234054565, "step": 20612 }, { "epoch": 0.41228, "grad_norm": 1.9140625, "grad_norm_var": 0.008443196614583334, "learning_rate": 0.0001, "loss": 4.2078, "loss/crossentropy": 2.1832374930381775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19204121828079224, "step": 20614 }, { "epoch": 0.41232, "grad_norm": 1.9140625, "grad_norm_var": 0.007982381184895833, "learning_rate": 0.0001, "loss": 3.8745, "loss/crossentropy": 1.6739779114723206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15309840440750122, "step": 20616 }, { "epoch": 0.41236, "grad_norm": 1.8046875, "grad_norm_var": 0.008754221598307292, "learning_rate": 0.0001, "loss": 4.1438, "loss/crossentropy": 2.1585883498191833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20271800458431244, "step": 20618 }, { "epoch": 0.4124, "grad_norm": 1.9609375, "grad_norm_var": 0.008235422770182292, "learning_rate": 0.0001, "loss": 4.4289, "loss/crossentropy": 2.391141653060913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2225441113114357, "step": 20620 }, { "epoch": 0.41244, "grad_norm": 1.9140625, "grad_norm_var": 0.0053619384765625, "learning_rate": 0.0001, "loss": 4.1355, "loss/crossentropy": 2.055308997631073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839761957526207, "step": 20622 }, { "epoch": 0.41248, "grad_norm": 2.015625, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 4.1143, "loss/crossentropy": 2.1222954988479614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18773587048053741, "step": 20624 }, { "epoch": 0.41252, "grad_norm": 2.0625, "grad_norm_var": 0.005647532145182292, "learning_rate": 0.0001, "loss": 4.1506, "loss/crossentropy": 2.011850595474243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19865088164806366, "step": 20626 }, { "epoch": 0.41256, "grad_norm": 1.9453125, "grad_norm_var": 0.0039784749348958336, "learning_rate": 0.0001, "loss": 4.0122, "loss/crossentropy": 1.841840922832489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18899217247962952, "step": 20628 }, { "epoch": 0.4126, "grad_norm": 1.953125, "grad_norm_var": 0.004313151041666667, "learning_rate": 0.0001, "loss": 4.2477, "loss/crossentropy": 2.335289478302002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22496683150529861, "step": 20630 }, { "epoch": 0.41264, "grad_norm": 1.8515625, "grad_norm_var": 0.004487864176432292, "learning_rate": 0.0001, "loss": 3.9252, "loss/crossentropy": 2.0089204907417297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18921320140361786, "step": 20632 }, { "epoch": 0.41268, "grad_norm": 2.0625, "grad_norm_var": 0.004146067301432291, "learning_rate": 0.0001, "loss": 4.052, "loss/crossentropy": 2.115709662437439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033844143152237, "step": 20634 }, { "epoch": 0.41272, "grad_norm": 1.921875, "grad_norm_var": 0.004282379150390625, "learning_rate": 0.0001, "loss": 4.0311, "loss/crossentropy": 2.175198018550873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20709756761789322, "step": 20636 }, { "epoch": 0.41276, "grad_norm": 1.8515625, "grad_norm_var": 0.005248006184895833, "learning_rate": 0.0001, "loss": 4.0452, "loss/crossentropy": 2.128443717956543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18793444335460663, "step": 20638 }, { "epoch": 0.4128, "grad_norm": 1.90625, "grad_norm_var": 0.005830891927083333, "learning_rate": 0.0001, "loss": 3.9456, "loss/crossentropy": 1.890213668346405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17856723070144653, "step": 20640 }, { "epoch": 0.41284, "grad_norm": 2.125, "grad_norm_var": 0.008283487955729167, "learning_rate": 0.0001, "loss": 4.1055, "loss/crossentropy": 1.925938606262207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19046395272016525, "step": 20642 }, { "epoch": 0.41288, "grad_norm": 1.9453125, "grad_norm_var": 0.008442942301432292, "learning_rate": 0.0001, "loss": 4.2369, "loss/crossentropy": 2.0149936079978943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20876548439264297, "step": 20644 }, { "epoch": 0.41292, "grad_norm": 1.9140625, "grad_norm_var": 0.008174641927083334, "learning_rate": 0.0001, "loss": 4.1904, "loss/crossentropy": 2.237368941307068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20562848448753357, "step": 20646 }, { "epoch": 0.41296, "grad_norm": 1.8828125, "grad_norm_var": 0.008451080322265625, "learning_rate": 0.0001, "loss": 3.8332, "loss/crossentropy": 2.0298978090286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20426444709300995, "step": 20648 }, { "epoch": 0.413, "grad_norm": 2.09375, "grad_norm_var": 0.0090240478515625, "learning_rate": 0.0001, "loss": 3.872, "loss/crossentropy": 1.8883816599845886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18203142285346985, "step": 20650 }, { "epoch": 0.41304, "grad_norm": 2.25, "grad_norm_var": 0.014288075764973958, "learning_rate": 0.0001, "loss": 4.0831, "loss/crossentropy": 2.051727771759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19955651462078094, "step": 20652 }, { "epoch": 0.41308, "grad_norm": 2.015625, "grad_norm_var": 0.013335927327473959, "learning_rate": 0.0001, "loss": 4.1298, "loss/crossentropy": 2.3192938566207886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23360159993171692, "step": 20654 }, { "epoch": 0.41312, "grad_norm": 1.859375, "grad_norm_var": 0.014647420247395833, "learning_rate": 0.0001, "loss": 3.8486, "loss/crossentropy": 2.2148059606552124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20121797919273376, "step": 20656 }, { "epoch": 0.41316, "grad_norm": 2.015625, "grad_norm_var": 0.011787923177083333, "learning_rate": 0.0001, "loss": 4.2523, "loss/crossentropy": 2.0665117502212524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18279288709163666, "step": 20658 }, { "epoch": 0.4132, "grad_norm": 1.9140625, "grad_norm_var": 0.013060506184895833, "learning_rate": 0.0001, "loss": 3.8782, "loss/crossentropy": 2.0858163833618164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19623829424381256, "step": 20660 }, { "epoch": 0.41324, "grad_norm": 1.875, "grad_norm_var": 0.013630167643229166, "learning_rate": 0.0001, "loss": 3.8134, "loss/crossentropy": 2.238997220993042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20264007151126862, "step": 20662 }, { "epoch": 0.41328, "grad_norm": 2.0625, "grad_norm_var": 0.016605377197265625, "learning_rate": 0.0001, "loss": 4.2317, "loss/crossentropy": 2.2540037631988525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21315360069274902, "step": 20664 }, { "epoch": 0.41332, "grad_norm": 2.03125, "grad_norm_var": 0.014975738525390626, "learning_rate": 0.0001, "loss": 4.1657, "loss/crossentropy": 2.3957645893096924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319154515862465, "step": 20666 }, { "epoch": 0.41336, "grad_norm": 2.015625, "grad_norm_var": 0.009527333577473958, "learning_rate": 0.0001, "loss": 4.1946, "loss/crossentropy": 2.046703338623047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992144137620926, "step": 20668 }, { "epoch": 0.4134, "grad_norm": 1.9375, "grad_norm_var": 0.010677083333333334, "learning_rate": 0.0001, "loss": 3.7664, "loss/crossentropy": 1.7997825145721436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1673276051878929, "step": 20670 }, { "epoch": 0.41344, "grad_norm": 2.03125, "grad_norm_var": 0.009069569905598958, "learning_rate": 0.0001, "loss": 3.9432, "loss/crossentropy": 2.102105975151062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19461503624916077, "step": 20672 }, { "epoch": 0.41348, "grad_norm": 1.8984375, "grad_norm_var": 0.009557851155598958, "learning_rate": 0.0001, "loss": 3.8547, "loss/crossentropy": 1.7070594429969788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17999255657196045, "step": 20674 }, { "epoch": 0.41352, "grad_norm": 1.890625, "grad_norm_var": 0.00869140625, "learning_rate": 0.0001, "loss": 4.0361, "loss/crossentropy": 2.1908987760543823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010250836610794, "step": 20676 }, { "epoch": 0.41356, "grad_norm": 2.046875, "grad_norm_var": 0.007582346598307292, "learning_rate": 0.0001, "loss": 4.2711, "loss/crossentropy": 2.1943222284317017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21789705008268356, "step": 20678 }, { "epoch": 0.4136, "grad_norm": 1.9609375, "grad_norm_var": 0.003842926025390625, "learning_rate": 0.0001, "loss": 4.2444, "loss/crossentropy": 2.1216484904289246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20987947285175323, "step": 20680 }, { "epoch": 0.41364, "grad_norm": 1.9140625, "grad_norm_var": 0.0038889567057291668, "learning_rate": 0.0001, "loss": 4.0256, "loss/crossentropy": 2.3570865392684937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23686359077692032, "step": 20682 }, { "epoch": 0.41368, "grad_norm": 1.953125, "grad_norm_var": 0.0040771484375, "learning_rate": 0.0001, "loss": 4.1448, "loss/crossentropy": 1.834035336971283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17401690781116486, "step": 20684 }, { "epoch": 0.41372, "grad_norm": 2.109375, "grad_norm_var": 0.0040891011555989586, "learning_rate": 0.0001, "loss": 4.2049, "loss/crossentropy": 2.251484513282776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22543959319591522, "step": 20686 }, { "epoch": 0.41376, "grad_norm": 1.9375, "grad_norm_var": 0.004571278889973958, "learning_rate": 0.0001, "loss": 4.3775, "loss/crossentropy": 2.233001708984375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18968196213245392, "step": 20688 }, { "epoch": 0.4138, "grad_norm": 1.96875, "grad_norm_var": 0.004198964436848958, "learning_rate": 0.0001, "loss": 3.9482, "loss/crossentropy": 1.6730775833129883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1740739718079567, "step": 20690 }, { "epoch": 0.41384, "grad_norm": 1.8515625, "grad_norm_var": 0.0100341796875, "learning_rate": 0.0001, "loss": 3.7717, "loss/crossentropy": 2.3700443506240845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20850200206041336, "step": 20692 }, { "epoch": 0.41388, "grad_norm": 2.0, "grad_norm_var": 0.0098541259765625, "learning_rate": 0.0001, "loss": 4.2457, "loss/crossentropy": 2.253411650657654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19234959036111832, "step": 20694 }, { "epoch": 0.41392, "grad_norm": 1.9296875, "grad_norm_var": 0.011836751302083334, "learning_rate": 0.0001, "loss": 3.9595, "loss/crossentropy": 2.414412260055542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20320549607276917, "step": 20696 }, { "epoch": 0.41396, "grad_norm": 1.9296875, "grad_norm_var": 0.012562815348307292, "learning_rate": 0.0001, "loss": 3.8084, "loss/crossentropy": 1.8252301216125488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19211787730455399, "step": 20698 }, { "epoch": 0.414, "grad_norm": 2.015625, "grad_norm_var": 0.012387847900390625, "learning_rate": 0.0001, "loss": 4.123, "loss/crossentropy": 2.210301160812378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21196797490119934, "step": 20700 }, { "epoch": 0.41404, "grad_norm": 2.15625, "grad_norm_var": 0.013480377197265626, "learning_rate": 0.0001, "loss": 4.0629, "loss/crossentropy": 1.6429123878479004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15947452187538147, "step": 20702 }, { "epoch": 0.41408, "grad_norm": 1.84375, "grad_norm_var": 0.014218902587890625, "learning_rate": 0.0001, "loss": 3.9954, "loss/crossentropy": 1.8459800481796265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18668671697378159, "step": 20704 }, { "epoch": 0.41412, "grad_norm": 1.921875, "grad_norm_var": 0.014147694905598958, "learning_rate": 0.0001, "loss": 3.9468, "loss/crossentropy": 2.20779287815094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034989595413208, "step": 20706 }, { "epoch": 0.41416, "grad_norm": 1.9140625, "grad_norm_var": 0.03316218058268229, "learning_rate": 0.0001, "loss": 4.0503, "loss/crossentropy": 1.9622939229011536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19208793342113495, "step": 20708 }, { "epoch": 0.4142, "grad_norm": 1.9140625, "grad_norm_var": 0.032990519205729166, "learning_rate": 0.0001, "loss": 4.2467, "loss/crossentropy": 1.9145439267158508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17929863929748535, "step": 20710 }, { "epoch": 0.41424, "grad_norm": 1.6953125, "grad_norm_var": 0.03706029256184896, "learning_rate": 0.0001, "loss": 3.9202, "loss/crossentropy": 1.9035375714302063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21188101917505264, "step": 20712 }, { "epoch": 0.41428, "grad_norm": 1.8671875, "grad_norm_var": 0.0380767822265625, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 2.0500373244285583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2162824273109436, "step": 20714 }, { "epoch": 0.41432, "grad_norm": 2.0625, "grad_norm_var": 0.03862075805664063, "learning_rate": 0.0001, "loss": 4.1101, "loss/crossentropy": 1.9774840474128723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20070379972457886, "step": 20716 }, { "epoch": 0.41436, "grad_norm": 2.125, "grad_norm_var": 0.03785985310872396, "learning_rate": 0.0001, "loss": 4.3021, "loss/crossentropy": 1.9861173629760742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19038750231266022, "step": 20718 }, { "epoch": 0.4144, "grad_norm": 1.875, "grad_norm_var": 0.053098297119140624, "learning_rate": 0.0001, "loss": 3.669, "loss/crossentropy": 1.9744009375572205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19013763219118118, "step": 20720 }, { "epoch": 0.41444, "grad_norm": 1.8515625, "grad_norm_var": 0.054441070556640624, "learning_rate": 0.0001, "loss": 3.9343, "loss/crossentropy": 1.7110218405723572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1715855970978737, "step": 20722 }, { "epoch": 0.41448, "grad_norm": 2.015625, "grad_norm_var": 0.031556955973307294, "learning_rate": 0.0001, "loss": 3.9484, "loss/crossentropy": 1.8336329460144043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19409292936325073, "step": 20724 }, { "epoch": 0.41452, "grad_norm": 1.9765625, "grad_norm_var": 0.03246027628580729, "learning_rate": 0.0001, "loss": 3.9218, "loss/crossentropy": 1.8363453149795532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16922436654567719, "step": 20726 }, { "epoch": 0.41456, "grad_norm": 2.09375, "grad_norm_var": 0.025699869791666666, "learning_rate": 0.0001, "loss": 4.1121, "loss/crossentropy": 1.8911715745925903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19849538803100586, "step": 20728 }, { "epoch": 0.4146, "grad_norm": 1.8359375, "grad_norm_var": 0.02716064453125, "learning_rate": 0.0001, "loss": 3.8365, "loss/crossentropy": 1.7989731431007385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18458950519561768, "step": 20730 }, { "epoch": 0.41464, "grad_norm": 1.921875, "grad_norm_var": 0.02692235310872396, "learning_rate": 0.0001, "loss": 3.9896, "loss/crossentropy": 1.9423270225524902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18612459301948547, "step": 20732 }, { "epoch": 0.41468, "grad_norm": 1.9921875, "grad_norm_var": 0.06059137980143229, "learning_rate": 0.0001, "loss": 3.7741, "loss/crossentropy": 1.7587624788284302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1545337438583374, "step": 20734 }, { "epoch": 0.41472, "grad_norm": 1.8984375, "grad_norm_var": 0.04381510416666667, "learning_rate": 0.0001, "loss": 3.9651, "loss/crossentropy": 2.314169406890869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22612107545137405, "step": 20736 }, { "epoch": 0.41476, "grad_norm": 2.421875, "grad_norm_var": 0.0535308837890625, "learning_rate": 0.0001, "loss": 4.272, "loss/crossentropy": 2.088030219078064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19648675620555878, "step": 20738 }, { "epoch": 0.4148, "grad_norm": 2.03125, "grad_norm_var": 0.05324071248372396, "learning_rate": 0.0001, "loss": 3.8286, "loss/crossentropy": 1.6665831208229065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17022767663002014, "step": 20740 }, { "epoch": 0.41484, "grad_norm": 1.9765625, "grad_norm_var": 0.05115458170572917, "learning_rate": 0.0001, "loss": 4.1142, "loss/crossentropy": 2.425256609916687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20156945288181305, "step": 20742 }, { "epoch": 0.41488, "grad_norm": 2.0, "grad_norm_var": 0.05110244750976563, "learning_rate": 0.0001, "loss": 3.8343, "loss/crossentropy": 1.6360740661621094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16550948470830917, "step": 20744 }, { "epoch": 0.41492, "grad_norm": 2.078125, "grad_norm_var": 0.044960276285807295, "learning_rate": 0.0001, "loss": 4.0782, "loss/crossentropy": 2.0676532983779907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21137626469135284, "step": 20746 }, { "epoch": 0.41496, "grad_norm": 1.9453125, "grad_norm_var": 0.04678141276041667, "learning_rate": 0.0001, "loss": 4.122, "loss/crossentropy": 2.059161067008972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20899715274572372, "step": 20748 }, { "epoch": 0.415, "grad_norm": 1.9296875, "grad_norm_var": 0.017268625895182292, "learning_rate": 0.0001, "loss": 4.1438, "loss/crossentropy": 2.1778156757354736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20495261996984482, "step": 20750 }, { "epoch": 0.41504, "grad_norm": 2.125, "grad_norm_var": 0.016877237955729166, "learning_rate": 0.0001, "loss": 4.2809, "loss/crossentropy": 2.368219017982483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21667053550481796, "step": 20752 }, { "epoch": 0.41508, "grad_norm": 1.828125, "grad_norm_var": 0.006703440348307292, "learning_rate": 0.0001, "loss": 3.7872, "loss/crossentropy": 1.9143288731575012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1782715767621994, "step": 20754 }, { "epoch": 0.41512, "grad_norm": 2.1875, "grad_norm_var": 0.009388987223307292, "learning_rate": 0.0001, "loss": 4.2937, "loss/crossentropy": 1.7776128649711609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1746697723865509, "step": 20756 }, { "epoch": 0.41516, "grad_norm": 1.90625, "grad_norm_var": 0.009824371337890625, "learning_rate": 0.0001, "loss": 3.9991, "loss/crossentropy": 2.057144343852997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18555855751037598, "step": 20758 }, { "epoch": 0.4152, "grad_norm": 1.9453125, "grad_norm_var": 0.012048085530598959, "learning_rate": 0.0001, "loss": 4.0996, "loss/crossentropy": 1.9628003239631653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2057315558195114, "step": 20760 }, { "epoch": 0.41524, "grad_norm": 2.109375, "grad_norm_var": 0.01282958984375, "learning_rate": 0.0001, "loss": 3.9163, "loss/crossentropy": 1.8570061326026917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19530560076236725, "step": 20762 }, { "epoch": 0.41528, "grad_norm": 2.015625, "grad_norm_var": 0.0115386962890625, "learning_rate": 0.0001, "loss": 3.9201, "loss/crossentropy": 1.939897060394287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979508325457573, "step": 20764 }, { "epoch": 0.41532, "grad_norm": 1.921875, "grad_norm_var": 0.0116607666015625, "learning_rate": 0.0001, "loss": 4.0702, "loss/crossentropy": 2.15025132894516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2082752287387848, "step": 20766 }, { "epoch": 0.41536, "grad_norm": 1.9765625, "grad_norm_var": 0.010713704427083333, "learning_rate": 0.0001, "loss": 4.0037, "loss/crossentropy": 2.202960252761841, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20040277391672134, "step": 20768 }, { "epoch": 0.4154, "grad_norm": 1.96875, "grad_norm_var": 0.008208974202473959, "learning_rate": 0.0001, "loss": 4.05, "loss/crossentropy": 2.107069969177246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1995261162519455, "step": 20770 }, { "epoch": 0.41544, "grad_norm": 1.84375, "grad_norm_var": 0.0071103413899739586, "learning_rate": 0.0001, "loss": 3.8939, "loss/crossentropy": 2.287596106529236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20532196760177612, "step": 20772 }, { "epoch": 0.41548, "grad_norm": 2.0625, "grad_norm_var": 0.013236236572265626, "learning_rate": 0.0001, "loss": 3.7519, "loss/crossentropy": 1.8234007954597473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17727909982204437, "step": 20774 }, { "epoch": 0.41552, "grad_norm": 1.9453125, "grad_norm_var": 0.010581207275390626, "learning_rate": 0.0001, "loss": 3.8238, "loss/crossentropy": 1.9223560690879822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918986812233925, "step": 20776 }, { "epoch": 0.41556, "grad_norm": 1.8671875, "grad_norm_var": 0.009769694010416666, "learning_rate": 0.0001, "loss": 3.8487, "loss/crossentropy": 2.0554555654525757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17795050144195557, "step": 20778 }, { "epoch": 0.4156, "grad_norm": 1.9765625, "grad_norm_var": 0.009427642822265625, "learning_rate": 0.0001, "loss": 3.9899, "loss/crossentropy": 2.170193314552307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20256754010915756, "step": 20780 }, { "epoch": 0.41564, "grad_norm": 2.0, "grad_norm_var": 0.009732818603515625, "learning_rate": 0.0001, "loss": 3.8443, "loss/crossentropy": 1.704530119895935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18020445108413696, "step": 20782 }, { "epoch": 0.41568, "grad_norm": 1.8671875, "grad_norm_var": 0.009915924072265625, "learning_rate": 0.0001, "loss": 4.0143, "loss/crossentropy": 2.1245445013046265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20647235959768295, "step": 20784 }, { "epoch": 0.41572, "grad_norm": 1.890625, "grad_norm_var": 0.009895579020182291, "learning_rate": 0.0001, "loss": 4.1806, "loss/crossentropy": 2.4078409671783447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126438021659851, "step": 20786 }, { "epoch": 0.41576, "grad_norm": 1.84375, "grad_norm_var": 0.011732737223307291, "learning_rate": 0.0001, "loss": 4.1084, "loss/crossentropy": 2.0577614307403564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19302819669246674, "step": 20788 }, { "epoch": 0.4158, "grad_norm": 1.9921875, "grad_norm_var": 0.0061686197916666664, "learning_rate": 0.0001, "loss": 4.0234, "loss/crossentropy": 2.0301398038864136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947554647922516, "step": 20790 }, { "epoch": 0.41584, "grad_norm": 1.8984375, "grad_norm_var": 0.0062558492024739586, "learning_rate": 0.0001, "loss": 3.9092, "loss/crossentropy": 2.276142120361328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21211084723472595, "step": 20792 }, { "epoch": 0.41588, "grad_norm": 1.859375, "grad_norm_var": 0.006125640869140625, "learning_rate": 0.0001, "loss": 4.1577, "loss/crossentropy": 2.050966262817383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20556103438138962, "step": 20794 }, { "epoch": 0.41592, "grad_norm": 1.859375, "grad_norm_var": 0.009218088785807292, "learning_rate": 0.0001, "loss": 4.1697, "loss/crossentropy": 2.135987937450409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21866349130868912, "step": 20796 }, { "epoch": 0.41596, "grad_norm": 1.90625, "grad_norm_var": 0.0089263916015625, "learning_rate": 0.0001, "loss": 4.0574, "loss/crossentropy": 2.0885995030403137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148696705698967, "step": 20798 }, { "epoch": 0.416, "grad_norm": 1.9296875, "grad_norm_var": 0.008471425374348958, "learning_rate": 0.0001, "loss": 3.9976, "loss/crossentropy": 1.8934147953987122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19176077097654343, "step": 20800 }, { "epoch": 0.41604, "grad_norm": 2.0, "grad_norm_var": 0.007970937093098958, "learning_rate": 0.0001, "loss": 4.3367, "loss/crossentropy": 2.355128526687622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21396907418966293, "step": 20802 }, { "epoch": 0.41608, "grad_norm": 1.9765625, "grad_norm_var": 0.005619049072265625, "learning_rate": 0.0001, "loss": 4.1109, "loss/crossentropy": 2.2145700454711914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21031766384840012, "step": 20804 }, { "epoch": 0.41612, "grad_norm": 1.8984375, "grad_norm_var": 0.006048329671223958, "learning_rate": 0.0001, "loss": 3.8883, "loss/crossentropy": 2.17924165725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19039995223283768, "step": 20806 }, { "epoch": 0.41616, "grad_norm": 1.8828125, "grad_norm_var": 0.006017812093098958, "learning_rate": 0.0001, "loss": 4.0044, "loss/crossentropy": 2.2674453258514404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18743296712636948, "step": 20808 }, { "epoch": 0.4162, "grad_norm": 2.0625, "grad_norm_var": 0.052711741129557295, "learning_rate": 0.0001, "loss": 4.1857, "loss/crossentropy": 2.331398367881775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20658230036497116, "step": 20810 }, { "epoch": 0.41624, "grad_norm": 1.9609375, "grad_norm_var": 0.05022354125976562, "learning_rate": 0.0001, "loss": 4.0019, "loss/crossentropy": 1.6092209815979004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16996397078037262, "step": 20812 }, { "epoch": 0.41628, "grad_norm": 1.953125, "grad_norm_var": 0.04944661458333333, "learning_rate": 0.0001, "loss": 4.0452, "loss/crossentropy": 2.094128370285034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19843773543834686, "step": 20814 }, { "epoch": 0.41632, "grad_norm": 2.046875, "grad_norm_var": 0.050842030843098955, "learning_rate": 0.0001, "loss": 3.9735, "loss/crossentropy": 1.9059642553329468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20919224619865417, "step": 20816 }, { "epoch": 0.41636, "grad_norm": 2.015625, "grad_norm_var": 0.050966135660807294, "learning_rate": 0.0001, "loss": 3.9026, "loss/crossentropy": 2.0733126401901245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132449895143509, "step": 20818 }, { "epoch": 0.4164, "grad_norm": 1.9765625, "grad_norm_var": 0.05125223795572917, "learning_rate": 0.0001, "loss": 4.1978, "loss/crossentropy": 2.09629487991333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21850410103797913, "step": 20820 }, { "epoch": 0.41644, "grad_norm": 1.8984375, "grad_norm_var": 0.05614802042643229, "learning_rate": 0.0001, "loss": 3.6648, "loss/crossentropy": 1.7917864322662354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16684360802173615, "step": 20822 }, { "epoch": 0.41648, "grad_norm": 2.03125, "grad_norm_var": 0.0561187744140625, "learning_rate": 0.0001, "loss": 3.7587, "loss/crossentropy": 1.629518985748291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16551091521978378, "step": 20824 }, { "epoch": 0.41652, "grad_norm": 2.03125, "grad_norm_var": 0.009476725260416667, "learning_rate": 0.0001, "loss": 4.2593, "loss/crossentropy": 2.2774561643600464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21336688101291656, "step": 20826 }, { "epoch": 0.41656, "grad_norm": 1.9375, "grad_norm_var": 0.010684967041015625, "learning_rate": 0.0001, "loss": 3.7644, "loss/crossentropy": 2.0319623947143555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19169747829437256, "step": 20828 }, { "epoch": 0.4166, "grad_norm": 1.921875, "grad_norm_var": 0.010261789957682291, "learning_rate": 0.0001, "loss": 3.7815, "loss/crossentropy": 1.8104961514472961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1863522008061409, "step": 20830 }, { "epoch": 0.41664, "grad_norm": 2.09375, "grad_norm_var": 0.0119781494140625, "learning_rate": 0.0001, "loss": 3.815, "loss/crossentropy": 1.729806661605835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17772196978330612, "step": 20832 }, { "epoch": 0.41668, "grad_norm": 2.03125, "grad_norm_var": 0.012116495768229167, "learning_rate": 0.0001, "loss": 3.9823, "loss/crossentropy": 1.9465582966804504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193453811109066, "step": 20834 }, { "epoch": 0.41672, "grad_norm": 1.953125, "grad_norm_var": 0.0122222900390625, "learning_rate": 0.0001, "loss": 4.1078, "loss/crossentropy": 1.895602285861969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18032599240541458, "step": 20836 }, { "epoch": 0.41676, "grad_norm": 1.84375, "grad_norm_var": 0.009924062093098958, "learning_rate": 0.0001, "loss": 3.7889, "loss/crossentropy": 2.082605481147766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21816352754831314, "step": 20838 }, { "epoch": 0.4168, "grad_norm": 1.8125, "grad_norm_var": 0.010628255208333333, "learning_rate": 0.0001, "loss": 3.9431, "loss/crossentropy": 1.7751468420028687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18468675762414932, "step": 20840 }, { "epoch": 0.41684, "grad_norm": 1.8671875, "grad_norm_var": 0.007838694254557292, "learning_rate": 0.0001, "loss": 4.1319, "loss/crossentropy": 2.2611895203590393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24100882560014725, "step": 20842 }, { "epoch": 0.41688, "grad_norm": 1.953125, "grad_norm_var": 0.0078125, "learning_rate": 0.0001, "loss": 4.0711, "loss/crossentropy": 2.1387221813201904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980198249220848, "step": 20844 }, { "epoch": 0.41692, "grad_norm": 1.8671875, "grad_norm_var": 0.008206939697265625, "learning_rate": 0.0001, "loss": 4.0165, "loss/crossentropy": 2.385176658630371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22007980942726135, "step": 20846 }, { "epoch": 0.41696, "grad_norm": 1.859375, "grad_norm_var": 0.005273183186848958, "learning_rate": 0.0001, "loss": 3.9463, "loss/crossentropy": 1.9382571578025818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19266389310359955, "step": 20848 }, { "epoch": 0.417, "grad_norm": 2.046875, "grad_norm_var": 0.006583404541015625, "learning_rate": 0.0001, "loss": 4.3108, "loss/crossentropy": 1.91634601354599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147107571363449, "step": 20850 }, { "epoch": 0.41704, "grad_norm": 1.8515625, "grad_norm_var": 0.012532297770182292, "learning_rate": 0.0001, "loss": 3.9546, "loss/crossentropy": 1.9752032160758972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20748814195394516, "step": 20852 }, { "epoch": 0.41708, "grad_norm": 1.890625, "grad_norm_var": 0.011818186442057291, "learning_rate": 0.0001, "loss": 3.9149, "loss/crossentropy": 1.844580054283142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17454466968774796, "step": 20854 }, { "epoch": 0.41712, "grad_norm": 1.875, "grad_norm_var": 0.010652669270833333, "learning_rate": 0.0001, "loss": 4.0963, "loss/crossentropy": 2.18678879737854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004237025976181, "step": 20856 }, { "epoch": 0.41716, "grad_norm": 1.90625, "grad_norm_var": 0.010583241780598959, "learning_rate": 0.0001, "loss": 3.855, "loss/crossentropy": 1.7560098767280579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17802055180072784, "step": 20858 }, { "epoch": 0.4172, "grad_norm": 1.9921875, "grad_norm_var": 0.009911092122395833, "learning_rate": 0.0001, "loss": 4.1007, "loss/crossentropy": 1.8778411746025085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892327517271042, "step": 20860 }, { "epoch": 0.41724, "grad_norm": 1.9921875, "grad_norm_var": 0.009220123291015625, "learning_rate": 0.0001, "loss": 3.9034, "loss/crossentropy": 1.8801356554031372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18983188271522522, "step": 20862 }, { "epoch": 0.41728, "grad_norm": 1.9453125, "grad_norm_var": 0.010448201497395834, "learning_rate": 0.0001, "loss": 3.974, "loss/crossentropy": 2.0934754014015198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17912424355745316, "step": 20864 }, { "epoch": 0.41732, "grad_norm": 2.0625, "grad_norm_var": 0.010247548421223959, "learning_rate": 0.0001, "loss": 4.0854, "loss/crossentropy": 2.115522801876068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20105450600385666, "step": 20866 }, { "epoch": 0.41736, "grad_norm": 1.9453125, "grad_norm_var": 0.004622141520182292, "learning_rate": 0.0001, "loss": 4.0041, "loss/crossentropy": 2.1903135776519775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19662026315927505, "step": 20868 }, { "epoch": 0.4174, "grad_norm": 1.9296875, "grad_norm_var": 0.005794016520182291, "learning_rate": 0.0001, "loss": 4.2067, "loss/crossentropy": 2.1521800756454468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20861530303955078, "step": 20870 }, { "epoch": 0.41744, "grad_norm": 1.9765625, "grad_norm_var": 0.006145985921223959, "learning_rate": 0.0001, "loss": 4.4442, "loss/crossentropy": 2.0802451968193054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19014900922775269, "step": 20872 }, { "epoch": 0.41748, "grad_norm": 1.890625, "grad_norm_var": 0.006091054280598958, "learning_rate": 0.0001, "loss": 4.008, "loss/crossentropy": 1.8639289140701294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17143192887306213, "step": 20874 }, { "epoch": 0.41752, "grad_norm": 2.03125, "grad_norm_var": 0.006525675455729167, "learning_rate": 0.0001, "loss": 4.2595, "loss/crossentropy": 2.191206693649292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013956382870674, "step": 20876 }, { "epoch": 0.41756, "grad_norm": 1.8984375, "grad_norm_var": 0.006754557291666667, "learning_rate": 0.0001, "loss": 3.9213, "loss/crossentropy": 1.8131752610206604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932450830936432, "step": 20878 }, { "epoch": 0.4176, "grad_norm": 1.9453125, "grad_norm_var": 0.005018870035807292, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 2.1427782773971558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21525658667087555, "step": 20880 }, { "epoch": 0.41764, "grad_norm": 2.015625, "grad_norm_var": 0.005956013997395833, "learning_rate": 0.0001, "loss": 4.1872, "loss/crossentropy": 1.997750997543335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22109957039356232, "step": 20882 }, { "epoch": 0.41768, "grad_norm": 1.90625, "grad_norm_var": 0.007700347900390625, "learning_rate": 0.0001, "loss": 4.0445, "loss/crossentropy": 2.2799811363220215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18671181797981262, "step": 20884 }, { "epoch": 0.41772, "grad_norm": 1.921875, "grad_norm_var": 0.006886545817057292, "learning_rate": 0.0001, "loss": 3.7884, "loss/crossentropy": 1.887694001197815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19071876257658005, "step": 20886 }, { "epoch": 0.41776, "grad_norm": 2.03125, "grad_norm_var": 0.008235677083333334, "learning_rate": 0.0001, "loss": 4.0758, "loss/crossentropy": 2.0238420367240906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015087753534317, "step": 20888 }, { "epoch": 0.4178, "grad_norm": 1.8828125, "grad_norm_var": 0.008481597900390625, "learning_rate": 0.0001, "loss": 4.3148, "loss/crossentropy": 2.5183900594711304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474501490592957, "step": 20890 }, { "epoch": 0.41784, "grad_norm": 1.9140625, "grad_norm_var": 0.0083160400390625, "learning_rate": 0.0001, "loss": 3.9086, "loss/crossentropy": 2.0730000734329224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18937845528125763, "step": 20892 }, { "epoch": 0.41788, "grad_norm": 1.9296875, "grad_norm_var": 0.008101145426432291, "learning_rate": 0.0001, "loss": 3.9845, "loss/crossentropy": 2.083210587501526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19441235065460205, "step": 20894 }, { "epoch": 0.41792, "grad_norm": 1.7734375, "grad_norm_var": 0.010961659749348958, "learning_rate": 0.0001, "loss": 3.9813, "loss/crossentropy": 1.7236113548278809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931980401277542, "step": 20896 }, { "epoch": 0.41796, "grad_norm": 1.984375, "grad_norm_var": 0.008796183268229167, "learning_rate": 0.0001, "loss": 4.0757, "loss/crossentropy": 2.108364462852478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20710121095180511, "step": 20898 }, { "epoch": 0.418, "grad_norm": 1.7421875, "grad_norm_var": 0.010179646809895833, "learning_rate": 0.0001, "loss": 3.8073, "loss/crossentropy": 2.10055810213089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20653685182332993, "step": 20900 }, { "epoch": 0.41804, "grad_norm": 1.96875, "grad_norm_var": 0.011517079671223958, "learning_rate": 0.0001, "loss": 4.0977, "loss/crossentropy": 2.254370093345642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21069462597370148, "step": 20902 }, { "epoch": 0.41808, "grad_norm": 1.8125, "grad_norm_var": 0.010107421875, "learning_rate": 0.0001, "loss": 3.8429, "loss/crossentropy": 2.053595006465912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19040333479642868, "step": 20904 }, { "epoch": 0.41812, "grad_norm": 2.1875, "grad_norm_var": 0.014012654622395834, "learning_rate": 0.0001, "loss": 4.3406, "loss/crossentropy": 2.1739301681518555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19576483964920044, "step": 20906 }, { "epoch": 0.41816, "grad_norm": 2.140625, "grad_norm_var": 0.01635920206705729, "learning_rate": 0.0001, "loss": 4.2794, "loss/crossentropy": 1.9707902073860168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20553794503211975, "step": 20908 }, { "epoch": 0.4182, "grad_norm": 1.9375, "grad_norm_var": 0.017097981770833333, "learning_rate": 0.0001, "loss": 4.0596, "loss/crossentropy": 1.901296079158783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18205050379037857, "step": 20910 }, { "epoch": 0.41824, "grad_norm": 1.8828125, "grad_norm_var": 0.015388997395833333, "learning_rate": 0.0001, "loss": 4.3012, "loss/crossentropy": 2.2531062364578247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18042850494384766, "step": 20912 }, { "epoch": 0.41828, "grad_norm": 1.90625, "grad_norm_var": 0.0159088134765625, "learning_rate": 0.0001, "loss": 3.8839, "loss/crossentropy": 1.959227204322815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19045928120613098, "step": 20914 }, { "epoch": 0.41832, "grad_norm": 1.828125, "grad_norm_var": 0.014207967122395833, "learning_rate": 0.0001, "loss": 3.9595, "loss/crossentropy": 2.1432504057884216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1991618126630783, "step": 20916 }, { "epoch": 0.41836, "grad_norm": 1.96875, "grad_norm_var": 0.013423665364583334, "learning_rate": 0.0001, "loss": 4.0369, "loss/crossentropy": 2.090354800224304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19416044652462006, "step": 20918 }, { "epoch": 0.4184, "grad_norm": 2.015625, "grad_norm_var": 0.011889394124348958, "learning_rate": 0.0001, "loss": 4.0133, "loss/crossentropy": 2.0165509581565857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20892751216888428, "step": 20920 }, { "epoch": 0.41844, "grad_norm": 1.890625, "grad_norm_var": 0.009273020426432292, "learning_rate": 0.0001, "loss": 4.0852, "loss/crossentropy": 2.2627620697021484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20449956506490707, "step": 20922 }, { "epoch": 0.41848, "grad_norm": 1.8203125, "grad_norm_var": 0.007249959309895833, "learning_rate": 0.0001, "loss": 3.8586, "loss/crossentropy": 1.9771518111228943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20680005848407745, "step": 20924 }, { "epoch": 0.41852, "grad_norm": 1.921875, "grad_norm_var": 0.006982421875, "learning_rate": 0.0001, "loss": 4.1715, "loss/crossentropy": 2.2706029415130615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22013718634843826, "step": 20926 }, { "epoch": 0.41856, "grad_norm": 1.9375, "grad_norm_var": 0.0060503641764322914, "learning_rate": 0.0001, "loss": 4.1293, "loss/crossentropy": 2.071012258529663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21389423310756683, "step": 20928 }, { "epoch": 0.4186, "grad_norm": 1.890625, "grad_norm_var": 0.0071044921875, "learning_rate": 0.0001, "loss": 4.3878, "loss/crossentropy": 2.084509491920471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20820914208889008, "step": 20930 }, { "epoch": 0.41864, "grad_norm": 1.9296875, "grad_norm_var": 0.005855051676432291, "learning_rate": 0.0001, "loss": 3.7038, "loss/crossentropy": 2.0449500679969788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20955926179885864, "step": 20932 }, { "epoch": 0.41868, "grad_norm": 2.046875, "grad_norm_var": 0.005881500244140625, "learning_rate": 0.0001, "loss": 4.0819, "loss/crossentropy": 2.0860745310783386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21544789522886276, "step": 20934 }, { "epoch": 0.41872, "grad_norm": 1.9296875, "grad_norm_var": 0.0065877278645833336, "learning_rate": 0.0001, "loss": 3.9627, "loss/crossentropy": 2.042892038822174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20232345163822174, "step": 20936 }, { "epoch": 0.41876, "grad_norm": 1.9765625, "grad_norm_var": 0.006298828125, "learning_rate": 0.0001, "loss": 4.3137, "loss/crossentropy": 2.3840891122817993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21144797652959824, "step": 20938 }, { "epoch": 0.4188, "grad_norm": 1.8671875, "grad_norm_var": 0.005210113525390625, "learning_rate": 0.0001, "loss": 4.1256, "loss/crossentropy": 1.8122249245643616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1800767481327057, "step": 20940 }, { "epoch": 0.41884, "grad_norm": 1.9765625, "grad_norm_var": 0.0052073160807291664, "learning_rate": 0.0001, "loss": 4.2324, "loss/crossentropy": 2.168284773826599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116834670305252, "step": 20942 }, { "epoch": 0.41888, "grad_norm": 1.9609375, "grad_norm_var": 0.006005859375, "learning_rate": 0.0001, "loss": 4.0847, "loss/crossentropy": 2.1386263370513916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18317822366952896, "step": 20944 }, { "epoch": 0.41892, "grad_norm": 1.921875, "grad_norm_var": 0.004829915364583334, "learning_rate": 0.0001, "loss": 3.993, "loss/crossentropy": 2.134578227996826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20232250541448593, "step": 20946 }, { "epoch": 0.41896, "grad_norm": 1.875, "grad_norm_var": 0.005116526285807292, "learning_rate": 0.0001, "loss": 4.212, "loss/crossentropy": 2.138009011745453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20763013511896133, "step": 20948 }, { "epoch": 0.419, "grad_norm": 1.9921875, "grad_norm_var": 0.004881795247395833, "learning_rate": 0.0001, "loss": 3.9959, "loss/crossentropy": 2.1796361207962036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887281984090805, "step": 20950 }, { "epoch": 0.41904, "grad_norm": 2.015625, "grad_norm_var": 0.004369099934895833, "learning_rate": 0.0001, "loss": 3.9328, "loss/crossentropy": 1.9848375916481018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21374072134494781, "step": 20952 }, { "epoch": 0.41908, "grad_norm": 2.015625, "grad_norm_var": 0.003985341389973958, "learning_rate": 0.0001, "loss": 4.1199, "loss/crossentropy": 1.9993594288825989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19865376502275467, "step": 20954 }, { "epoch": 0.41912, "grad_norm": 1.984375, "grad_norm_var": 0.003883616129557292, "learning_rate": 0.0001, "loss": 4.0882, "loss/crossentropy": 1.841040551662445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984993740916252, "step": 20956 }, { "epoch": 0.41916, "grad_norm": 2.203125, "grad_norm_var": 0.007472483317057291, "learning_rate": 0.0001, "loss": 4.3558, "loss/crossentropy": 2.4816187620162964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072821408510208, "step": 20958 }, { "epoch": 0.4192, "grad_norm": 2.140625, "grad_norm_var": 0.009114329020182292, "learning_rate": 0.0001, "loss": 4.3989, "loss/crossentropy": 2.1488978266716003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2619112655520439, "step": 20960 }, { "epoch": 0.41924, "grad_norm": 2.046875, "grad_norm_var": 0.009251912434895834, "learning_rate": 0.0001, "loss": 3.8774, "loss/crossentropy": 1.8684781193733215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915435940027237, "step": 20962 }, { "epoch": 0.41928, "grad_norm": 1.9453125, "grad_norm_var": 0.008599599202473959, "learning_rate": 0.0001, "loss": 4.0564, "loss/crossentropy": 2.2308766841888428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21091555058956146, "step": 20964 }, { "epoch": 0.41932, "grad_norm": 1.8828125, "grad_norm_var": 0.0086822509765625, "learning_rate": 0.0001, "loss": 4.016, "loss/crossentropy": 2.0642316341400146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18509763479232788, "step": 20966 }, { "epoch": 0.41936, "grad_norm": 2.078125, "grad_norm_var": 0.008929189046223958, "learning_rate": 0.0001, "loss": 4.2041, "loss/crossentropy": 2.4577912092208862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22347646206617355, "step": 20968 }, { "epoch": 0.4194, "grad_norm": 2.125, "grad_norm_var": 0.009563954671223958, "learning_rate": 0.0001, "loss": 4.4522, "loss/crossentropy": 2.0276389122009277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18190696835517883, "step": 20970 }, { "epoch": 0.41944, "grad_norm": 1.8515625, "grad_norm_var": 0.011171213785807292, "learning_rate": 0.0001, "loss": 3.7112, "loss/crossentropy": 1.920120656490326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18268628418445587, "step": 20972 }, { "epoch": 0.41948, "grad_norm": 1.8125, "grad_norm_var": 0.010074615478515625, "learning_rate": 0.0001, "loss": 3.671, "loss/crossentropy": 2.0709950923919678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20330249518156052, "step": 20974 }, { "epoch": 0.41952, "grad_norm": 1.8203125, "grad_norm_var": 0.0092926025390625, "learning_rate": 0.0001, "loss": 4.1188, "loss/crossentropy": 2.059907555580139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23229123651981354, "step": 20976 }, { "epoch": 0.41956, "grad_norm": 1.96875, "grad_norm_var": 0.012190755208333333, "learning_rate": 0.0001, "loss": 3.8434, "loss/crossentropy": 1.9516863226890564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19371748715639114, "step": 20978 }, { "epoch": 0.4196, "grad_norm": 1.9296875, "grad_norm_var": 0.012181599934895834, "learning_rate": 0.0001, "loss": 3.8149, "loss/crossentropy": 1.8882723450660706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1830422580242157, "step": 20980 }, { "epoch": 0.41964, "grad_norm": 1.9296875, "grad_norm_var": 0.011864980061848959, "learning_rate": 0.0001, "loss": 4.2085, "loss/crossentropy": 2.3770724534988403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055138498544693, "step": 20982 }, { "epoch": 0.41968, "grad_norm": 1.90625, "grad_norm_var": 0.010221354166666667, "learning_rate": 0.0001, "loss": 4.0082, "loss/crossentropy": 2.302065849304199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21600044518709183, "step": 20984 }, { "epoch": 0.41972, "grad_norm": 1.953125, "grad_norm_var": 0.00772705078125, "learning_rate": 0.0001, "loss": 4.2794, "loss/crossentropy": 2.1097174286842346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151128277182579, "step": 20986 }, { "epoch": 0.41976, "grad_norm": 1.875, "grad_norm_var": 0.007765452067057292, "learning_rate": 0.0001, "loss": 4.0175, "loss/crossentropy": 2.1860098838806152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20798291265964508, "step": 20988 }, { "epoch": 0.4198, "grad_norm": 1.90625, "grad_norm_var": 0.007258097330729167, "learning_rate": 0.0001, "loss": 3.7789, "loss/crossentropy": 2.0685555934906006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20412929356098175, "step": 20990 }, { "epoch": 0.41984, "grad_norm": 2.0, "grad_norm_var": 0.0064046223958333336, "learning_rate": 0.0001, "loss": 3.9187, "loss/crossentropy": 1.886826515197754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17476174980401993, "step": 20992 }, { "epoch": 0.41988, "grad_norm": 1.9296875, "grad_norm_var": 0.003446197509765625, "learning_rate": 0.0001, "loss": 3.853, "loss/crossentropy": 1.8504652380943298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18863234668970108, "step": 20994 }, { "epoch": 0.41992, "grad_norm": 1.9140625, "grad_norm_var": 0.0035308837890625, "learning_rate": 0.0001, "loss": 4.0992, "loss/crossentropy": 2.396555781364441, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21075168251991272, "step": 20996 }, { "epoch": 0.41996, "grad_norm": 2.203125, "grad_norm_var": 0.008676910400390625, "learning_rate": 0.0001, "loss": 4.0947, "loss/crossentropy": 2.005809009075165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158650904893875, "step": 20998 }, { "epoch": 0.42, "grad_norm": 1.9375, "grad_norm_var": 0.008660634358723959, "learning_rate": 0.0001, "loss": 3.8813, "loss/crossentropy": 1.8755770325660706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19334855675697327, "step": 21000 }, { "epoch": 0.42004, "grad_norm": 1.8828125, "grad_norm_var": 0.007989247639973959, "learning_rate": 0.0001, "loss": 4.1168, "loss/crossentropy": 2.4014203548431396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22790876030921936, "step": 21002 }, { "epoch": 0.42008, "grad_norm": 1.96875, "grad_norm_var": 0.008983357747395834, "learning_rate": 0.0001, "loss": 3.7571, "loss/crossentropy": 1.959916114807129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19611585140228271, "step": 21004 }, { "epoch": 0.42012, "grad_norm": 1.8984375, "grad_norm_var": 0.010212961832682292, "learning_rate": 0.0001, "loss": 4.0505, "loss/crossentropy": 2.077211081981659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19385989010334015, "step": 21006 }, { "epoch": 0.42016, "grad_norm": 2.328125, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 4.2484, "loss/crossentropy": 2.1105176210403442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.303642213344574, "step": 21008 }, { "epoch": 0.4202, "grad_norm": 1.921875, "grad_norm_var": 0.01786066691080729, "learning_rate": 0.0001, "loss": 4.0644, "loss/crossentropy": 2.0568641424179077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2137533351778984, "step": 21010 }, { "epoch": 0.42024, "grad_norm": 1.921875, "grad_norm_var": 0.018424224853515626, "learning_rate": 0.0001, "loss": 3.9592, "loss/crossentropy": 2.185602903366089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20309215039014816, "step": 21012 }, { "epoch": 0.42028, "grad_norm": 2.0, "grad_norm_var": 0.014646148681640625, "learning_rate": 0.0001, "loss": 4.1891, "loss/crossentropy": 1.9848479628562927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918111816048622, "step": 21014 }, { "epoch": 0.42032, "grad_norm": 1.9765625, "grad_norm_var": 0.014731597900390626, "learning_rate": 0.0001, "loss": 4.167, "loss/crossentropy": 2.315872311592102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22417553514242172, "step": 21016 }, { "epoch": 0.42036, "grad_norm": 1.90625, "grad_norm_var": 0.014798736572265625, "learning_rate": 0.0001, "loss": 3.9912, "loss/crossentropy": 2.1230897903442383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19763804227113724, "step": 21018 }, { "epoch": 0.4204, "grad_norm": 2.015625, "grad_norm_var": 0.012947336832682291, "learning_rate": 0.0001, "loss": 4.2364, "loss/crossentropy": 1.9737728834152222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18397405743598938, "step": 21020 }, { "epoch": 0.42044, "grad_norm": 1.9375, "grad_norm_var": 0.012064615885416666, "learning_rate": 0.0001, "loss": 4.0749, "loss/crossentropy": 2.1349278688430786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1923152655363083, "step": 21022 }, { "epoch": 0.42048, "grad_norm": 1.9921875, "grad_norm_var": 0.007242584228515625, "learning_rate": 0.0001, "loss": 4.1186, "loss/crossentropy": 2.110077440738678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115027979016304, "step": 21024 }, { "epoch": 0.42052, "grad_norm": 1.9140625, "grad_norm_var": 0.008430989583333333, "learning_rate": 0.0001, "loss": 4.0164, "loss/crossentropy": 2.0237202048301697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17709830403327942, "step": 21026 }, { "epoch": 0.42056, "grad_norm": 1.9453125, "grad_norm_var": 0.006990559895833333, "learning_rate": 0.0001, "loss": 4.2527, "loss/crossentropy": 2.1712347269058228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20721115171909332, "step": 21028 }, { "epoch": 0.4206, "grad_norm": 1.890625, "grad_norm_var": 0.007054646809895833, "learning_rate": 0.0001, "loss": 4.087, "loss/crossentropy": 2.1889408826828003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2252693474292755, "step": 21030 }, { "epoch": 0.42064, "grad_norm": 1.890625, "grad_norm_var": 0.009051259358723958, "learning_rate": 0.0001, "loss": 4.2847, "loss/crossentropy": 1.9189648032188416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20265216380357742, "step": 21032 }, { "epoch": 0.42068, "grad_norm": 2.046875, "grad_norm_var": 0.009008534749348958, "learning_rate": 0.0001, "loss": 4.2777, "loss/crossentropy": 2.335593581199646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21812127530574799, "step": 21034 }, { "epoch": 0.42072, "grad_norm": 1.90625, "grad_norm_var": 0.010723622639973958, "learning_rate": 0.0001, "loss": 3.9306, "loss/crossentropy": 2.0845232605934143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974613294005394, "step": 21036 }, { "epoch": 0.42076, "grad_norm": 1.96875, "grad_norm_var": 0.010528310139973959, "learning_rate": 0.0001, "loss": 4.2117, "loss/crossentropy": 2.0062036514282227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18701466172933578, "step": 21038 }, { "epoch": 0.4208, "grad_norm": 1.96875, "grad_norm_var": 0.006520334879557292, "learning_rate": 0.0001, "loss": 4.1506, "loss/crossentropy": 2.30399227142334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108677700161934, "step": 21040 }, { "epoch": 0.42084, "grad_norm": 2.046875, "grad_norm_var": 0.006036122639973958, "learning_rate": 0.0001, "loss": 4.4162, "loss/crossentropy": 2.0211732387542725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1925763636827469, "step": 21042 }, { "epoch": 0.42088, "grad_norm": 1.9140625, "grad_norm_var": 0.006986490885416667, "learning_rate": 0.0001, "loss": 4.2366, "loss/crossentropy": 2.259310483932495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19453367590904236, "step": 21044 }, { "epoch": 0.42092, "grad_norm": 1.8984375, "grad_norm_var": 0.006818644205729167, "learning_rate": 0.0001, "loss": 3.8528, "loss/crossentropy": 2.225229859352112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21418282389640808, "step": 21046 }, { "epoch": 0.42096, "grad_norm": 1.953125, "grad_norm_var": 0.004667154947916667, "learning_rate": 0.0001, "loss": 3.993, "loss/crossentropy": 2.007095217704773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1860651671886444, "step": 21048 }, { "epoch": 0.421, "grad_norm": 1.921875, "grad_norm_var": 0.004117838541666667, "learning_rate": 0.0001, "loss": 3.8646, "loss/crossentropy": 2.2098451256752014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123607099056244, "step": 21050 }, { "epoch": 0.42104, "grad_norm": 1.96875, "grad_norm_var": 0.0024861653645833334, "learning_rate": 0.0001, "loss": 3.9383, "loss/crossentropy": 2.038383424282074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19423578679561615, "step": 21052 }, { "epoch": 0.42108, "grad_norm": 1.828125, "grad_norm_var": 0.0047910054524739586, "learning_rate": 0.0001, "loss": 3.5858, "loss/crossentropy": 1.5344518423080444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15087325125932693, "step": 21054 }, { "epoch": 0.42112, "grad_norm": 1.9921875, "grad_norm_var": 0.0050351460774739586, "learning_rate": 0.0001, "loss": 3.7398, "loss/crossentropy": 2.0475123524665833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19996189326047897, "step": 21056 }, { "epoch": 0.42116, "grad_norm": 1.9296875, "grad_norm_var": 0.0047604878743489586, "learning_rate": 0.0001, "loss": 3.8581, "loss/crossentropy": 1.9760377407073975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20170825719833374, "step": 21058 }, { "epoch": 0.4212, "grad_norm": 2.25, "grad_norm_var": 0.009871419270833333, "learning_rate": 0.0001, "loss": 4.0876, "loss/crossentropy": 2.0718571543693542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987539902329445, "step": 21060 }, { "epoch": 0.42124, "grad_norm": 1.84375, "grad_norm_var": 0.010908762613932291, "learning_rate": 0.0001, "loss": 3.7226, "loss/crossentropy": 1.694058895111084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15808547288179398, "step": 21062 }, { "epoch": 0.42128, "grad_norm": 2.0, "grad_norm_var": 0.010827382405598959, "learning_rate": 0.0001, "loss": 3.9852, "loss/crossentropy": 2.024729013442993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19920113682746887, "step": 21064 }, { "epoch": 0.42132, "grad_norm": 2.28125, "grad_norm_var": 0.01858495076497396, "learning_rate": 0.0001, "loss": 4.3691, "loss/crossentropy": 2.2473320960998535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366955801844597, "step": 21066 }, { "epoch": 0.42136, "grad_norm": 1.9765625, "grad_norm_var": 0.020230865478515624, "learning_rate": 0.0001, "loss": 4.014, "loss/crossentropy": 1.8571689128875732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198233962059021, "step": 21068 }, { "epoch": 0.4214, "grad_norm": 2.0625, "grad_norm_var": 0.018314361572265625, "learning_rate": 0.0001, "loss": 4.066, "loss/crossentropy": 1.7178888320922852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756814494729042, "step": 21070 }, { "epoch": 0.42144, "grad_norm": 2.453125, "grad_norm_var": 0.031840006510416664, "learning_rate": 0.0001, "loss": 4.0847, "loss/crossentropy": 2.5577595233917236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22383909672498703, "step": 21072 }, { "epoch": 0.42148, "grad_norm": 1.8828125, "grad_norm_var": 0.030926259358723958, "learning_rate": 0.0001, "loss": 4.1138, "loss/crossentropy": 1.8517940640449524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17597733438014984, "step": 21074 }, { "epoch": 0.42152, "grad_norm": 1.7265625, "grad_norm_var": 0.033934529622395834, "learning_rate": 0.0001, "loss": 4.1207, "loss/crossentropy": 2.2587300539016724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20134828239679337, "step": 21076 }, { "epoch": 0.42156, "grad_norm": 2.109375, "grad_norm_var": 0.04946263631184896, "learning_rate": 0.0001, "loss": 4.0223, "loss/crossentropy": 1.9288156032562256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2372005432844162, "step": 21078 }, { "epoch": 0.4216, "grad_norm": 2.109375, "grad_norm_var": 0.051454416910807294, "learning_rate": 0.0001, "loss": 3.9205, "loss/crossentropy": 1.7192566990852356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19468504190444946, "step": 21080 }, { "epoch": 0.42164, "grad_norm": 1.9296875, "grad_norm_var": 0.0494293212890625, "learning_rate": 0.0001, "loss": 4.1596, "loss/crossentropy": 2.2686651945114136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18164453655481339, "step": 21082 }, { "epoch": 0.42168, "grad_norm": 1.8359375, "grad_norm_var": 0.048685455322265626, "learning_rate": 0.0001, "loss": 4.1317, "loss/crossentropy": 2.090322732925415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19734591990709305, "step": 21084 }, { "epoch": 0.42172, "grad_norm": 2.0, "grad_norm_var": 0.047761027018229166, "learning_rate": 0.0001, "loss": 4.1072, "loss/crossentropy": 2.3203768730163574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18960122764110565, "step": 21086 }, { "epoch": 0.42176, "grad_norm": 1.9296875, "grad_norm_var": 0.04411392211914063, "learning_rate": 0.0001, "loss": 4.1253, "loss/crossentropy": 2.1767812967300415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20625658333301544, "step": 21088 }, { "epoch": 0.4218, "grad_norm": 1.875, "grad_norm_var": 0.0453125, "learning_rate": 0.0001, "loss": 3.9737, "loss/crossentropy": 2.1308469772338867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20404084771871567, "step": 21090 }, { "epoch": 0.42184, "grad_norm": 2.109375, "grad_norm_var": 0.03766988118489583, "learning_rate": 0.0001, "loss": 4.0643, "loss/crossentropy": 2.34736967086792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22817441076040268, "step": 21092 }, { "epoch": 0.42188, "grad_norm": 1.9296875, "grad_norm_var": 0.019730631510416666, "learning_rate": 0.0001, "loss": 3.7592, "loss/crossentropy": 1.9515716433525085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18946269899606705, "step": 21094 }, { "epoch": 0.42192, "grad_norm": 1.90625, "grad_norm_var": 0.019181315104166666, "learning_rate": 0.0001, "loss": 4.2118, "loss/crossentropy": 2.1175013184547424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25575874000787735, "step": 21096 }, { "epoch": 0.42196, "grad_norm": 1.9375, "grad_norm_var": 0.019465128580729168, "learning_rate": 0.0001, "loss": 3.9934, "loss/crossentropy": 1.8839566707611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1840147227048874, "step": 21098 }, { "epoch": 0.422, "grad_norm": 2.015625, "grad_norm_var": 0.01761042277018229, "learning_rate": 0.0001, "loss": 3.9215, "loss/crossentropy": 1.8690025806427002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20930250734090805, "step": 21100 }, { "epoch": 0.42204, "grad_norm": 2.015625, "grad_norm_var": 0.01693700154622396, "learning_rate": 0.0001, "loss": 4.0877, "loss/crossentropy": 2.0368083119392395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19566239416599274, "step": 21102 }, { "epoch": 0.42208, "grad_norm": 2.078125, "grad_norm_var": 0.009384918212890624, "learning_rate": 0.0001, "loss": 3.9941, "loss/crossentropy": 1.5155547261238098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1755489930510521, "step": 21104 }, { "epoch": 0.42212, "grad_norm": 1.9140625, "grad_norm_var": 0.006259918212890625, "learning_rate": 0.0001, "loss": 3.9463, "loss/crossentropy": 1.850695788860321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17723418772220612, "step": 21106 }, { "epoch": 0.42216, "grad_norm": 1.828125, "grad_norm_var": 0.0071441650390625, "learning_rate": 0.0001, "loss": 3.8903, "loss/crossentropy": 2.0554863810539246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18637049198150635, "step": 21108 }, { "epoch": 0.4222, "grad_norm": 1.953125, "grad_norm_var": 0.0073883056640625, "learning_rate": 0.0001, "loss": 3.9719, "loss/crossentropy": 1.6186088919639587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17206687480211258, "step": 21110 }, { "epoch": 0.42224, "grad_norm": 1.8984375, "grad_norm_var": 0.006131744384765625, "learning_rate": 0.0001, "loss": 4.1193, "loss/crossentropy": 2.326562762260437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22168086469173431, "step": 21112 }, { "epoch": 0.42228, "grad_norm": 1.9296875, "grad_norm_var": 0.007670084635416667, "learning_rate": 0.0001, "loss": 3.8155, "loss/crossentropy": 1.8151599764823914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16791456937789917, "step": 21114 }, { "epoch": 0.42232, "grad_norm": 2.03125, "grad_norm_var": 0.007249959309895833, "learning_rate": 0.0001, "loss": 4.2537, "loss/crossentropy": 1.8597796559333801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19767651706933975, "step": 21116 }, { "epoch": 0.42236, "grad_norm": 1.9375, "grad_norm_var": 0.0056884765625, "learning_rate": 0.0001, "loss": 4.0409, "loss/crossentropy": 1.9591187238693237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18850377202033997, "step": 21118 }, { "epoch": 0.4224, "grad_norm": 2.09375, "grad_norm_var": 0.007684071858723958, "learning_rate": 0.0001, "loss": 4.1848, "loss/crossentropy": 2.0987173318862915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087177261710167, "step": 21120 }, { "epoch": 0.42244, "grad_norm": 2.015625, "grad_norm_var": 0.011010487874348959, "learning_rate": 0.0001, "loss": 3.96, "loss/crossentropy": 1.8050614595413208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19238876551389694, "step": 21122 }, { "epoch": 0.42248, "grad_norm": 2.015625, "grad_norm_var": 0.010487620035807292, "learning_rate": 0.0001, "loss": 4.1689, "loss/crossentropy": 2.0409955978393555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19230370223522186, "step": 21124 }, { "epoch": 0.42252, "grad_norm": 1.890625, "grad_norm_var": 0.009942372639973959, "learning_rate": 0.0001, "loss": 4.2033, "loss/crossentropy": 2.2344201803207397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21372146904468536, "step": 21126 }, { "epoch": 0.42256, "grad_norm": 2.015625, "grad_norm_var": 0.010033162434895833, "learning_rate": 0.0001, "loss": 3.9102, "loss/crossentropy": 1.905364751815796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17828336358070374, "step": 21128 }, { "epoch": 0.4226, "grad_norm": 8.6875, "grad_norm_var": 2.8468658447265627, "learning_rate": 0.0001, "loss": 4.4863, "loss/crossentropy": 2.118414044380188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183245092630386, "step": 21130 }, { "epoch": 0.42264, "grad_norm": 2.421875, "grad_norm_var": 2.826851399739583, "learning_rate": 0.0001, "loss": 3.9736, "loss/crossentropy": 1.8499796390533447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17906417697668076, "step": 21132 }, { "epoch": 0.42268, "grad_norm": 2.0625, "grad_norm_var": 2.808918253580729, "learning_rate": 0.0001, "loss": 3.9669, "loss/crossentropy": 2.046007513999939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20209533721208572, "step": 21134 }, { "epoch": 0.42272, "grad_norm": 2.078125, "grad_norm_var": 2.7881795247395833, "learning_rate": 0.0001, "loss": 4.4089, "loss/crossentropy": 2.4684035778045654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23784233629703522, "step": 21136 }, { "epoch": 0.42276, "grad_norm": 1.9296875, "grad_norm_var": 2.781150054931641, "learning_rate": 0.0001, "loss": 3.9599, "loss/crossentropy": 2.2614521980285645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20271501690149307, "step": 21138 }, { "epoch": 0.4228, "grad_norm": 1.9375, "grad_norm_var": 2.778649648030599, "learning_rate": 0.0001, "loss": 3.995, "loss/crossentropy": 1.9676395058631897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1826155036687851, "step": 21140 }, { "epoch": 0.42284, "grad_norm": 2.015625, "grad_norm_var": 2.77156982421875, "learning_rate": 0.0001, "loss": 4.0974, "loss/crossentropy": 1.9320645928382874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19518838077783585, "step": 21142 }, { "epoch": 0.42288, "grad_norm": 2.546875, "grad_norm_var": 2.751301066080729, "learning_rate": 0.0001, "loss": 4.2261, "loss/crossentropy": 2.0839805603027344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20529074221849442, "step": 21144 }, { "epoch": 0.42292, "grad_norm": 1.8671875, "grad_norm_var": 0.04561538696289062, "learning_rate": 0.0001, "loss": 3.6995, "loss/crossentropy": 1.7191408276557922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19476453214883804, "step": 21146 }, { "epoch": 0.42296, "grad_norm": 2.0625, "grad_norm_var": 0.026151275634765624, "learning_rate": 0.0001, "loss": 4.3019, "loss/crossentropy": 2.3838316202163696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2231682687997818, "step": 21148 }, { "epoch": 0.423, "grad_norm": 2.078125, "grad_norm_var": 0.02588678995768229, "learning_rate": 0.0001, "loss": 4.3851, "loss/crossentropy": 2.5376522541046143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22086318582296371, "step": 21150 }, { "epoch": 0.42304, "grad_norm": 1.8671875, "grad_norm_var": 0.0267486572265625, "learning_rate": 0.0001, "loss": 4.0375, "loss/crossentropy": 2.0386282801628113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19398655742406845, "step": 21152 }, { "epoch": 0.42308, "grad_norm": 1.984375, "grad_norm_var": 0.025047810872395833, "learning_rate": 0.0001, "loss": 4.0863, "loss/crossentropy": 1.8140466213226318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1843537762761116, "step": 21154 }, { "epoch": 0.42312, "grad_norm": 2.078125, "grad_norm_var": 0.026839192708333334, "learning_rate": 0.0001, "loss": 3.8526, "loss/crossentropy": 1.7746369242668152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17255257815122604, "step": 21156 }, { "epoch": 0.42316, "grad_norm": 1.9296875, "grad_norm_var": 0.0270904541015625, "learning_rate": 0.0001, "loss": 3.9869, "loss/crossentropy": 1.9952461123466492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18875698745250702, "step": 21158 }, { "epoch": 0.4232, "grad_norm": 2.125, "grad_norm_var": 0.0079742431640625, "learning_rate": 0.0001, "loss": 4.1292, "loss/crossentropy": 2.1353421211242676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20545977354049683, "step": 21160 }, { "epoch": 0.42324, "grad_norm": 1.90625, "grad_norm_var": 0.0072662353515625, "learning_rate": 0.0001, "loss": 3.8585, "loss/crossentropy": 1.93438321352005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2099536955356598, "step": 21162 }, { "epoch": 0.42328, "grad_norm": 2.125, "grad_norm_var": 0.008714803059895833, "learning_rate": 0.0001, "loss": 4.1526, "loss/crossentropy": 2.085531711578369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19908609241247177, "step": 21164 }, { "epoch": 0.42332, "grad_norm": 1.890625, "grad_norm_var": 0.010716756184895834, "learning_rate": 0.0001, "loss": 3.7828, "loss/crossentropy": 1.7960018515586853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17612508684396744, "step": 21166 }, { "epoch": 0.42336, "grad_norm": 2.03125, "grad_norm_var": 0.011982981363932292, "learning_rate": 0.0001, "loss": 4.0211, "loss/crossentropy": 2.3417539596557617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124033421278, "step": 21168 }, { "epoch": 0.4234, "grad_norm": 2.375, "grad_norm_var": 0.02384211222330729, "learning_rate": 0.0001, "loss": 4.0768, "loss/crossentropy": 1.7829700708389282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17676468938589096, "step": 21170 }, { "epoch": 0.42344, "grad_norm": 1.9375, "grad_norm_var": 0.027815500895182293, "learning_rate": 0.0001, "loss": 4.0011, "loss/crossentropy": 1.8243988156318665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1778729408979416, "step": 21172 }, { "epoch": 0.42348, "grad_norm": 1.9765625, "grad_norm_var": 0.02774225870768229, "learning_rate": 0.0001, "loss": 4.1254, "loss/crossentropy": 2.0906782150268555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19730936735868454, "step": 21174 }, { "epoch": 0.42352, "grad_norm": 1.9140625, "grad_norm_var": 0.026041666666666668, "learning_rate": 0.0001, "loss": 4.0611, "loss/crossentropy": 2.0885696411132812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19815433770418167, "step": 21176 }, { "epoch": 0.42356, "grad_norm": 1.8203125, "grad_norm_var": 0.027513631184895835, "learning_rate": 0.0001, "loss": 3.7571, "loss/crossentropy": 1.9133245944976807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18601036816835403, "step": 21178 }, { "epoch": 0.4236, "grad_norm": 1.890625, "grad_norm_var": 0.026071929931640626, "learning_rate": 0.0001, "loss": 3.9254, "loss/crossentropy": 2.3077515363693237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21091386675834656, "step": 21180 }, { "epoch": 0.42364, "grad_norm": 1.796875, "grad_norm_var": 0.0247222900390625, "learning_rate": 0.0001, "loss": 3.9337, "loss/crossentropy": 2.170205235481262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22082476317882538, "step": 21182 }, { "epoch": 0.42368, "grad_norm": 1.984375, "grad_norm_var": 0.0234619140625, "learning_rate": 0.0001, "loss": 4.1234, "loss/crossentropy": 2.053133964538574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19210617244243622, "step": 21184 }, { "epoch": 0.42372, "grad_norm": 1.984375, "grad_norm_var": 0.011930084228515625, "learning_rate": 0.0001, "loss": 4.1882, "loss/crossentropy": 2.246233820915222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110241949558258, "step": 21186 }, { "epoch": 0.42376, "grad_norm": 2.015625, "grad_norm_var": 0.005378214518229166, "learning_rate": 0.0001, "loss": 4.0148, "loss/crossentropy": 2.088374972343445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20536644011735916, "step": 21188 }, { "epoch": 0.4238, "grad_norm": 2.03125, "grad_norm_var": 0.006105295817057292, "learning_rate": 0.0001, "loss": 4.08, "loss/crossentropy": 2.2355328798294067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208805114030838, "step": 21190 }, { "epoch": 0.42384, "grad_norm": 2.125, "grad_norm_var": 0.008707682291666666, "learning_rate": 0.0001, "loss": 4.3974, "loss/crossentropy": 2.3286044001579285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21351350098848343, "step": 21192 }, { "epoch": 0.42388, "grad_norm": 1.9375, "grad_norm_var": 0.008131663004557291, "learning_rate": 0.0001, "loss": 4.1763, "loss/crossentropy": 2.188312590122223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166001871228218, "step": 21194 }, { "epoch": 0.42392, "grad_norm": 1.9140625, "grad_norm_var": 0.007873280843098959, "learning_rate": 0.0001, "loss": 3.9396, "loss/crossentropy": 2.1229456663131714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19442371279001236, "step": 21196 }, { "epoch": 0.42396, "grad_norm": 12.0, "grad_norm_var": 6.295235188802083, "learning_rate": 0.0001, "loss": 4.0976, "loss/crossentropy": 2.000286102294922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20243800431489944, "step": 21198 }, { "epoch": 0.424, "grad_norm": 2.078125, "grad_norm_var": 6.265840657552084, "learning_rate": 0.0001, "loss": 4.204, "loss/crossentropy": 2.0628392100334167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20828108489513397, "step": 21200 }, { "epoch": 0.42404, "grad_norm": 2.09375, "grad_norm_var": 6.274006144205729, "learning_rate": 0.0001, "loss": 4.0036, "loss/crossentropy": 1.6969141364097595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20169565826654434, "step": 21202 }, { "epoch": 0.42408, "grad_norm": 1.9609375, "grad_norm_var": 6.258829752604167, "learning_rate": 0.0001, "loss": 4.1475, "loss/crossentropy": 2.078153133392334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974920853972435, "step": 21204 }, { "epoch": 0.42412, "grad_norm": 2.03125, "grad_norm_var": 6.2670237223307295, "learning_rate": 0.0001, "loss": 3.7707, "loss/crossentropy": 1.9656822681427002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20239949226379395, "step": 21206 }, { "epoch": 0.42416, "grad_norm": 1.890625, "grad_norm_var": 6.289948527018229, "learning_rate": 0.0001, "loss": 4.0624, "loss/crossentropy": 1.721840500831604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1692550852894783, "step": 21208 }, { "epoch": 0.4242, "grad_norm": 1.9375, "grad_norm_var": 6.2876942952473955, "learning_rate": 0.0001, "loss": 4.0437, "loss/crossentropy": 2.093570113182068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127639278769493, "step": 21210 }, { "epoch": 0.42424, "grad_norm": 2.09375, "grad_norm_var": 6.257684071858724, "learning_rate": 0.0001, "loss": 4.1692, "loss/crossentropy": 2.476475954055786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22028189152479172, "step": 21212 }, { "epoch": 0.42428, "grad_norm": 2.3125, "grad_norm_var": 0.015653228759765624, "learning_rate": 0.0001, "loss": 3.8084, "loss/crossentropy": 2.1308672428131104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215510457754135, "step": 21214 }, { "epoch": 0.42432, "grad_norm": 1.9296875, "grad_norm_var": 0.014709218343098959, "learning_rate": 0.0001, "loss": 3.9183, "loss/crossentropy": 1.9697207808494568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23525278270244598, "step": 21216 }, { "epoch": 0.42436, "grad_norm": 2.03125, "grad_norm_var": 0.012943267822265625, "learning_rate": 0.0001, "loss": 4.3044, "loss/crossentropy": 2.3466382026672363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21148498356342316, "step": 21218 }, { "epoch": 0.4244, "grad_norm": 2.0, "grad_norm_var": 0.013421376546223959, "learning_rate": 0.0001, "loss": 4.0281, "loss/crossentropy": 1.9455206990242004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18886880576610565, "step": 21220 }, { "epoch": 0.42444, "grad_norm": 2.09375, "grad_norm_var": 0.010949452718098959, "learning_rate": 0.0001, "loss": 4.113, "loss/crossentropy": 2.030101954936981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19099503010511398, "step": 21222 }, { "epoch": 0.42448, "grad_norm": 1.9765625, "grad_norm_var": 0.0103912353515625, "learning_rate": 0.0001, "loss": 3.9738, "loss/crossentropy": 2.0859753489494324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20600532740354538, "step": 21224 }, { "epoch": 0.42452, "grad_norm": 2.03125, "grad_norm_var": 0.009749348958333333, "learning_rate": 0.0001, "loss": 4.0856, "loss/crossentropy": 1.9598749279975891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18913735449314117, "step": 21226 }, { "epoch": 0.42456, "grad_norm": 1.8203125, "grad_norm_var": 0.011791737874348958, "learning_rate": 0.0001, "loss": 3.8938, "loss/crossentropy": 1.8889980912208557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892652064561844, "step": 21228 }, { "epoch": 0.4246, "grad_norm": 1.9296875, "grad_norm_var": 0.005224355061848958, "learning_rate": 0.0001, "loss": 4.0116, "loss/crossentropy": 2.0479432940483093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20084280520677567, "step": 21230 }, { "epoch": 0.42464, "grad_norm": 2.046875, "grad_norm_var": 0.005407460530598958, "learning_rate": 0.0001, "loss": 4.1743, "loss/crossentropy": 2.2049208879470825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21012280881404877, "step": 21232 }, { "epoch": 0.42468, "grad_norm": 1.8125, "grad_norm_var": 0.006945546468098958, "learning_rate": 0.0001, "loss": 3.8626, "loss/crossentropy": 2.1863686442375183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19185375422239304, "step": 21234 }, { "epoch": 0.42472, "grad_norm": 1.8515625, "grad_norm_var": 0.010564931233723958, "learning_rate": 0.0001, "loss": 4.0699, "loss/crossentropy": 2.144856631755829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2018456757068634, "step": 21236 }, { "epoch": 0.42476, "grad_norm": 1.8828125, "grad_norm_var": 0.0170806884765625, "learning_rate": 0.0001, "loss": 4.0695, "loss/crossentropy": 2.0092588663101196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18666698783636093, "step": 21238 }, { "epoch": 0.4248, "grad_norm": 1.9921875, "grad_norm_var": 0.018365224202473957, "learning_rate": 0.0001, "loss": 3.8831, "loss/crossentropy": 1.9071536660194397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1925549954175949, "step": 21240 }, { "epoch": 0.42484, "grad_norm": 1.9453125, "grad_norm_var": 0.01822509765625, "learning_rate": 0.0001, "loss": 3.7471, "loss/crossentropy": 1.7945414185523987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18291736394166946, "step": 21242 }, { "epoch": 0.42488, "grad_norm": 1.8984375, "grad_norm_var": 0.017252604166666668, "learning_rate": 0.0001, "loss": 3.96, "loss/crossentropy": 2.0980335474014282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20030297338962555, "step": 21244 }, { "epoch": 0.42492, "grad_norm": 1.84375, "grad_norm_var": 0.01800715128580729, "learning_rate": 0.0001, "loss": 3.8988, "loss/crossentropy": 1.8301831483840942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18173757195472717, "step": 21246 }, { "epoch": 0.42496, "grad_norm": 2.015625, "grad_norm_var": 0.020734659830729165, "learning_rate": 0.0001, "loss": 3.8851, "loss/crossentropy": 2.1006619930267334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21144358813762665, "step": 21248 }, { "epoch": 0.425, "grad_norm": 1.90625, "grad_norm_var": 0.019220987955729168, "learning_rate": 0.0001, "loss": 4.1271, "loss/crossentropy": 2.2240719199180603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21180906891822815, "step": 21250 }, { "epoch": 0.42504, "grad_norm": 1.8984375, "grad_norm_var": 0.024079386393229166, "learning_rate": 0.0001, "loss": 4.2143, "loss/crossentropy": 2.187661051750183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19673307240009308, "step": 21252 }, { "epoch": 0.42508, "grad_norm": 1.8515625, "grad_norm_var": 0.07988993326822917, "learning_rate": 0.0001, "loss": 3.9913, "loss/crossentropy": 2.17081356048584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19375403225421906, "step": 21254 }, { "epoch": 0.42512, "grad_norm": 2.03125, "grad_norm_var": 0.0834673563639323, "learning_rate": 0.0001, "loss": 3.7945, "loss/crossentropy": 1.6309250593185425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17680996656417847, "step": 21256 }, { "epoch": 0.42516, "grad_norm": 2.171875, "grad_norm_var": 0.08495992024739583, "learning_rate": 0.0001, "loss": 3.97, "loss/crossentropy": 1.9581794142723083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18848375976085663, "step": 21258 }, { "epoch": 0.4252, "grad_norm": 1.9765625, "grad_norm_var": 0.0839508056640625, "learning_rate": 0.0001, "loss": 4.1796, "loss/crossentropy": 1.9206833839416504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18920007348060608, "step": 21260 }, { "epoch": 0.42524, "grad_norm": 1.9765625, "grad_norm_var": 0.08524169921875, "learning_rate": 0.0001, "loss": 3.8909, "loss/crossentropy": 1.736355721950531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17645896971225739, "step": 21262 }, { "epoch": 0.42528, "grad_norm": 2.328125, "grad_norm_var": 0.0872711181640625, "learning_rate": 0.0001, "loss": 4.0856, "loss/crossentropy": 1.9830248355865479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18551607429981232, "step": 21264 }, { "epoch": 0.42532, "grad_norm": 1.8359375, "grad_norm_var": 0.08831558227539063, "learning_rate": 0.0001, "loss": 3.8254, "loss/crossentropy": 1.8477718234062195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18887290358543396, "step": 21266 }, { "epoch": 0.42536, "grad_norm": 1.921875, "grad_norm_var": 0.08277359008789062, "learning_rate": 0.0001, "loss": 3.8833, "loss/crossentropy": 2.024181544780731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20996837317943573, "step": 21268 }, { "epoch": 0.4254, "grad_norm": 1.984375, "grad_norm_var": 0.021930948893229166, "learning_rate": 0.0001, "loss": 4.0265, "loss/crossentropy": 2.153610110282898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962708830833435, "step": 21270 }, { "epoch": 0.42544, "grad_norm": 1.8359375, "grad_norm_var": 0.019359334309895834, "learning_rate": 0.0001, "loss": 4.1288, "loss/crossentropy": 2.142420172691345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2167380526661873, "step": 21272 }, { "epoch": 0.42548, "grad_norm": 1.984375, "grad_norm_var": 0.017235310872395833, "learning_rate": 0.0001, "loss": 4.1983, "loss/crossentropy": 2.091115117073059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20703484117984772, "step": 21274 }, { "epoch": 0.42552, "grad_norm": 2.015625, "grad_norm_var": 0.017235310872395833, "learning_rate": 0.0001, "loss": 4.1653, "loss/crossentropy": 2.0014833211898804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172148898243904, "step": 21276 }, { "epoch": 0.42556, "grad_norm": 1.9453125, "grad_norm_var": 0.015340169270833334, "learning_rate": 0.0001, "loss": 3.9513, "loss/crossentropy": 1.8580491542816162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861124113202095, "step": 21278 }, { "epoch": 0.4256, "grad_norm": 1.859375, "grad_norm_var": 0.0071980794270833336, "learning_rate": 0.0001, "loss": 4.1013, "loss/crossentropy": 2.1323113441467285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839580088853836, "step": 21280 }, { "epoch": 0.42564, "grad_norm": 1.8984375, "grad_norm_var": 0.005952707926432292, "learning_rate": 0.0001, "loss": 4.0295, "loss/crossentropy": 2.107018530368805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19513815641403198, "step": 21282 }, { "epoch": 0.42568, "grad_norm": 2.0, "grad_norm_var": 0.005882771809895834, "learning_rate": 0.0001, "loss": 3.989, "loss/crossentropy": 2.025463044643402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19870947301387787, "step": 21284 }, { "epoch": 0.42572, "grad_norm": 2.125, "grad_norm_var": 0.007429758707682292, "learning_rate": 0.0001, "loss": 4.2469, "loss/crossentropy": 2.348206877708435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2326420098543167, "step": 21286 }, { "epoch": 0.42576, "grad_norm": 1.90625, "grad_norm_var": 0.005037434895833333, "learning_rate": 0.0001, "loss": 4.1699, "loss/crossentropy": 2.2780548334121704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905437633395195, "step": 21288 }, { "epoch": 0.4258, "grad_norm": 1.953125, "grad_norm_var": 0.004400380452473958, "learning_rate": 0.0001, "loss": 4.1373, "loss/crossentropy": 2.1915369629859924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19397883117198944, "step": 21290 }, { "epoch": 0.42584, "grad_norm": 2.015625, "grad_norm_var": 0.004353586832682292, "learning_rate": 0.0001, "loss": 4.0237, "loss/crossentropy": 2.132679283618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21279992163181305, "step": 21292 }, { "epoch": 0.42588, "grad_norm": 2.0625, "grad_norm_var": 0.006058756510416667, "learning_rate": 0.0001, "loss": 3.9847, "loss/crossentropy": 1.860496699810028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1666220724582672, "step": 21294 }, { "epoch": 0.42592, "grad_norm": 1.9921875, "grad_norm_var": 0.005535634358723959, "learning_rate": 0.0001, "loss": 4.1154, "loss/crossentropy": 2.031413435935974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919916793704033, "step": 21296 }, { "epoch": 0.42596, "grad_norm": 1.953125, "grad_norm_var": 0.005296834309895833, "learning_rate": 0.0001, "loss": 3.7703, "loss/crossentropy": 1.6095055937767029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1592901274561882, "step": 21298 }, { "epoch": 0.426, "grad_norm": 1.9453125, "grad_norm_var": 0.0052886962890625, "learning_rate": 0.0001, "loss": 4.1652, "loss/crossentropy": 2.002205550670624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18984580785036087, "step": 21300 }, { "epoch": 0.42604, "grad_norm": 1.9375, "grad_norm_var": 0.0029856363932291665, "learning_rate": 0.0001, "loss": 4.1755, "loss/crossentropy": 2.022138476371765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1904059648513794, "step": 21302 }, { "epoch": 0.42608, "grad_norm": 1.8515625, "grad_norm_var": 0.0036801656087239584, "learning_rate": 0.0001, "loss": 3.8417, "loss/crossentropy": 1.917612910270691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18372280895709991, "step": 21304 }, { "epoch": 0.42612, "grad_norm": 1.96875, "grad_norm_var": 0.004426829020182292, "learning_rate": 0.0001, "loss": 3.8031, "loss/crossentropy": 1.827630877494812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18420201539993286, "step": 21306 }, { "epoch": 0.42616, "grad_norm": 1.875, "grad_norm_var": 0.004115549723307291, "learning_rate": 0.0001, "loss": 3.8427, "loss/crossentropy": 1.7498629689216614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1638321429491043, "step": 21308 }, { "epoch": 0.4262, "grad_norm": 2.0625, "grad_norm_var": 0.0036374409993489582, "learning_rate": 0.0001, "loss": 4.0028, "loss/crossentropy": 1.7890136241912842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18237848579883575, "step": 21310 }, { "epoch": 0.42624, "grad_norm": 1.9609375, "grad_norm_var": 0.0033518473307291665, "learning_rate": 0.0001, "loss": 4.1649, "loss/crossentropy": 2.17901873588562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104976773262024, "step": 21312 }, { "epoch": 0.42628, "grad_norm": 1.9140625, "grad_norm_var": 0.003987630208333333, "learning_rate": 0.0001, "loss": 4.2149, "loss/crossentropy": 2.322218656539917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20789335668087006, "step": 21314 }, { "epoch": 0.42632, "grad_norm": 2.015625, "grad_norm_var": 0.004515584309895833, "learning_rate": 0.0001, "loss": 4.2208, "loss/crossentropy": 2.4203622341156006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21352334320545197, "step": 21316 }, { "epoch": 0.42636, "grad_norm": 2.125, "grad_norm_var": 0.008388010660807292, "learning_rate": 0.0001, "loss": 3.7956, "loss/crossentropy": 2.2349472045898438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009701356291771, "step": 21318 }, { "epoch": 0.4264, "grad_norm": 2.078125, "grad_norm_var": 0.009511057535807292, "learning_rate": 0.0001, "loss": 3.9733, "loss/crossentropy": 1.8173826336860657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1736140102148056, "step": 21320 }, { "epoch": 0.42644, "grad_norm": 1.7421875, "grad_norm_var": 0.011315663655598959, "learning_rate": 0.0001, "loss": 3.6665, "loss/crossentropy": 2.099972426891327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19424229115247726, "step": 21322 }, { "epoch": 0.42648, "grad_norm": 1.9375, "grad_norm_var": 0.06534830729166667, "learning_rate": 0.0001, "loss": 3.9213, "loss/crossentropy": 1.8005958795547485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17232687771320343, "step": 21324 }, { "epoch": 0.42652, "grad_norm": 2.09375, "grad_norm_var": 0.06697184244791667, "learning_rate": 0.0001, "loss": 4.1434, "loss/crossentropy": 2.0184829235076904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19896141439676285, "step": 21326 }, { "epoch": 0.42656, "grad_norm": 1.7421875, "grad_norm_var": 0.07361551920572916, "learning_rate": 0.0001, "loss": 3.7018, "loss/crossentropy": 2.0207254886627197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1863861232995987, "step": 21328 }, { "epoch": 0.4266, "grad_norm": 1.9609375, "grad_norm_var": 0.07342020670572917, "learning_rate": 0.0001, "loss": 4.2953, "loss/crossentropy": 2.1894538402557373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21097677946090698, "step": 21330 }, { "epoch": 0.42664, "grad_norm": 1.8125, "grad_norm_var": 0.07691650390625, "learning_rate": 0.0001, "loss": 4.1044, "loss/crossentropy": 2.1939873695373535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1935359239578247, "step": 21332 }, { "epoch": 0.42668, "grad_norm": 1.9375, "grad_norm_var": 0.07349624633789062, "learning_rate": 0.0001, "loss": 4.1346, "loss/crossentropy": 2.131330966949463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001090720295906, "step": 21334 }, { "epoch": 0.42672, "grad_norm": 1.828125, "grad_norm_var": 0.083154296875, "learning_rate": 0.0001, "loss": 3.9866, "loss/crossentropy": 1.9254841208457947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20036878436803818, "step": 21336 }, { "epoch": 0.42676, "grad_norm": 1.8828125, "grad_norm_var": 0.07875874837239584, "learning_rate": 0.0001, "loss": 4.0641, "loss/crossentropy": 1.9841803312301636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19805853068828583, "step": 21338 }, { "epoch": 0.4268, "grad_norm": 1.8828125, "grad_norm_var": 0.02767918904622396, "learning_rate": 0.0001, "loss": 3.6806, "loss/crossentropy": 1.6822729110717773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18236130475997925, "step": 21340 }, { "epoch": 0.42684, "grad_norm": 1.9453125, "grad_norm_var": 0.023789215087890624, "learning_rate": 0.0001, "loss": 4.1128, "loss/crossentropy": 2.0904295444488525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19326913356781006, "step": 21342 }, { "epoch": 0.42688, "grad_norm": 1.953125, "grad_norm_var": 0.01951878865559896, "learning_rate": 0.0001, "loss": 4.0758, "loss/crossentropy": 2.111131250858307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202858105301857, "step": 21344 }, { "epoch": 0.42692, "grad_norm": 1.890625, "grad_norm_var": 0.019197591145833335, "learning_rate": 0.0001, "loss": 4.2061, "loss/crossentropy": 2.2793352603912354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20112870633602142, "step": 21346 }, { "epoch": 0.42696, "grad_norm": 1.9140625, "grad_norm_var": 0.016112263997395834, "learning_rate": 0.0001, "loss": 4.1173, "loss/crossentropy": 2.253583312034607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21494006365537643, "step": 21348 }, { "epoch": 0.427, "grad_norm": 1.9140625, "grad_norm_var": 0.016532389322916667, "learning_rate": 0.0001, "loss": 4.2371, "loss/crossentropy": 1.9753515124320984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828732267022133, "step": 21350 }, { "epoch": 0.42704, "grad_norm": 1.921875, "grad_norm_var": 0.0023455301920572917, "learning_rate": 0.0001, "loss": 4.0961, "loss/crossentropy": 2.110904037952423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19640487432479858, "step": 21352 }, { "epoch": 0.42708, "grad_norm": 1.9765625, "grad_norm_var": 0.0022905985514322918, "learning_rate": 0.0001, "loss": 4.1961, "loss/crossentropy": 2.323657512664795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206008955836296, "step": 21354 }, { "epoch": 0.42712, "grad_norm": 1.8984375, "grad_norm_var": 0.003507232666015625, "learning_rate": 0.0001, "loss": 4.1613, "loss/crossentropy": 2.113715887069702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18136650323867798, "step": 21356 }, { "epoch": 0.42716, "grad_norm": 1.96875, "grad_norm_var": 0.004207102457682291, "learning_rate": 0.0001, "loss": 3.9951, "loss/crossentropy": 2.096550941467285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20972034335136414, "step": 21358 }, { "epoch": 0.4272, "grad_norm": 1.9375, "grad_norm_var": 0.004449208577473958, "learning_rate": 0.0001, "loss": 4.0398, "loss/crossentropy": 2.1959888339042664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19834591448307037, "step": 21360 }, { "epoch": 0.42724, "grad_norm": 1.8125, "grad_norm_var": 0.0054351806640625, "learning_rate": 0.0001, "loss": 4.0003, "loss/crossentropy": 2.146053671836853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18752741813659668, "step": 21362 }, { "epoch": 0.42728, "grad_norm": 1.8671875, "grad_norm_var": 0.005728912353515625, "learning_rate": 0.0001, "loss": 3.6564, "loss/crossentropy": 1.9178010821342468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20413734018802643, "step": 21364 }, { "epoch": 0.42732, "grad_norm": 1.9453125, "grad_norm_var": 0.005283355712890625, "learning_rate": 0.0001, "loss": 3.884, "loss/crossentropy": 1.8283003568649292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882033348083496, "step": 21366 }, { "epoch": 0.42736, "grad_norm": 2.1875, "grad_norm_var": 0.0103759765625, "learning_rate": 0.0001, "loss": 4.362, "loss/crossentropy": 2.1956799030303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21325727552175522, "step": 21368 }, { "epoch": 0.4274, "grad_norm": 1.9140625, "grad_norm_var": 0.010381825764973958, "learning_rate": 0.0001, "loss": 3.9304, "loss/crossentropy": 1.8081734776496887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1818179190158844, "step": 21370 }, { "epoch": 0.42744, "grad_norm": 1.921875, "grad_norm_var": 0.008699544270833333, "learning_rate": 0.0001, "loss": 4.3029, "loss/crossentropy": 2.283393979072571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20135364681482315, "step": 21372 }, { "epoch": 0.42748, "grad_norm": 1.7890625, "grad_norm_var": 0.013038889567057291, "learning_rate": 0.0001, "loss": 3.7118, "loss/crossentropy": 1.9896268844604492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17712793499231339, "step": 21374 }, { "epoch": 0.42752, "grad_norm": 2.0, "grad_norm_var": 0.017289225260416666, "learning_rate": 0.0001, "loss": 4.2804, "loss/crossentropy": 2.1862595081329346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19685917347669601, "step": 21376 }, { "epoch": 0.42756, "grad_norm": 1.890625, "grad_norm_var": 0.016355133056640624, "learning_rate": 0.0001, "loss": 3.9852, "loss/crossentropy": 2.097273588180542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204267680644989, "step": 21378 }, { "epoch": 0.4276, "grad_norm": 1.9140625, "grad_norm_var": 0.016031646728515626, "learning_rate": 0.0001, "loss": 4.0112, "loss/crossentropy": 2.0905693769454956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17958275228738785, "step": 21380 }, { "epoch": 0.42764, "grad_norm": 2.0, "grad_norm_var": 0.016502888997395833, "learning_rate": 0.0001, "loss": 4.2815, "loss/crossentropy": 2.1066064834594727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18819285184144974, "step": 21382 }, { "epoch": 0.42768, "grad_norm": 2.109375, "grad_norm_var": 0.0134429931640625, "learning_rate": 0.0001, "loss": 4.2721, "loss/crossentropy": 1.9919481873512268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19579682499170303, "step": 21384 }, { "epoch": 0.42772, "grad_norm": 1.9921875, "grad_norm_var": 0.01337890625, "learning_rate": 0.0001, "loss": 3.9186, "loss/crossentropy": 1.8611761927604675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929011344909668, "step": 21386 }, { "epoch": 0.42776, "grad_norm": 1.9453125, "grad_norm_var": 0.015592193603515625, "learning_rate": 0.0001, "loss": 3.9346, "loss/crossentropy": 2.232940196990967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934458464384079, "step": 21388 }, { "epoch": 0.4278, "grad_norm": 2.046875, "grad_norm_var": 0.0073883056640625, "learning_rate": 0.0001, "loss": 4.2149, "loss/crossentropy": 2.2017111778259277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22515927255153656, "step": 21390 }, { "epoch": 0.42784, "grad_norm": 2.1875, "grad_norm_var": 0.0079498291015625, "learning_rate": 0.0001, "loss": 4.3437, "loss/crossentropy": 2.4802298545837402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20159347355365753, "step": 21392 }, { "epoch": 0.42788, "grad_norm": 2.03125, "grad_norm_var": 0.0065093994140625, "learning_rate": 0.0001, "loss": 4.1095, "loss/crossentropy": 2.1320372819900513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19364330172538757, "step": 21394 }, { "epoch": 0.42792, "grad_norm": 1.8671875, "grad_norm_var": 0.0069488525390625, "learning_rate": 0.0001, "loss": 4.2004, "loss/crossentropy": 2.115567684173584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150493860244751, "step": 21396 }, { "epoch": 0.42796, "grad_norm": 2.1875, "grad_norm_var": 0.008259073893229166, "learning_rate": 0.0001, "loss": 4.4131, "loss/crossentropy": 2.3852893114089966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21371107548475266, "step": 21398 }, { "epoch": 0.428, "grad_norm": 2.0625, "grad_norm_var": 0.010204060872395834, "learning_rate": 0.0001, "loss": 3.9377, "loss/crossentropy": 1.872315526008606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17347048223018646, "step": 21400 }, { "epoch": 0.42804, "grad_norm": 3.9375, "grad_norm_var": 0.23815816243489582, "learning_rate": 0.0001, "loss": 4.0048, "loss/crossentropy": 2.064896881580353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20905783772468567, "step": 21402 }, { "epoch": 0.42808, "grad_norm": 1.90625, "grad_norm_var": 0.24268290201822917, "learning_rate": 0.0001, "loss": 3.6606, "loss/crossentropy": 1.90069580078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19062762707471848, "step": 21404 }, { "epoch": 0.42812, "grad_norm": 1.84375, "grad_norm_var": 0.24911473592122396, "learning_rate": 0.0001, "loss": 4.1083, "loss/crossentropy": 1.9533473253250122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17531338334083557, "step": 21406 }, { "epoch": 0.42816, "grad_norm": 2.03125, "grad_norm_var": 0.25371475219726564, "learning_rate": 0.0001, "loss": 3.9376, "loss/crossentropy": 1.9835584163665771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19894006848335266, "step": 21408 }, { "epoch": 0.4282, "grad_norm": 1.8359375, "grad_norm_var": 0.26023661295572914, "learning_rate": 0.0001, "loss": 3.6515, "loss/crossentropy": 2.2370243072509766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20410479605197906, "step": 21410 }, { "epoch": 0.42824, "grad_norm": 2.09375, "grad_norm_var": 0.2575152079264323, "learning_rate": 0.0001, "loss": 3.8597, "loss/crossentropy": 2.0012041330337524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18770772218704224, "step": 21412 }, { "epoch": 0.42828, "grad_norm": 1.8984375, "grad_norm_var": 0.2589312235514323, "learning_rate": 0.0001, "loss": 4.1147, "loss/crossentropy": 2.0049465894699097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19158685207366943, "step": 21414 }, { "epoch": 0.42832, "grad_norm": 2.046875, "grad_norm_var": 0.2571207682291667, "learning_rate": 0.0001, "loss": 3.9892, "loss/crossentropy": 2.1261045932769775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21888310462236404, "step": 21416 }, { "epoch": 0.42836, "grad_norm": 2.046875, "grad_norm_var": 0.007771809895833333, "learning_rate": 0.0001, "loss": 4.2261, "loss/crossentropy": 2.3438754081726074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22312359511852264, "step": 21418 }, { "epoch": 0.4284, "grad_norm": 1.9453125, "grad_norm_var": 0.0075457255045572914, "learning_rate": 0.0001, "loss": 4.0333, "loss/crossentropy": 1.970156729221344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17968790978193283, "step": 21420 }, { "epoch": 0.42844, "grad_norm": 1.9765625, "grad_norm_var": 0.007356516520182292, "learning_rate": 0.0001, "loss": 4.1106, "loss/crossentropy": 2.0128689408302307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.190870463848114, "step": 21422 }, { "epoch": 0.42848, "grad_norm": 2.734375, "grad_norm_var": 0.044755045572916666, "learning_rate": 0.0001, "loss": 4.0063, "loss/crossentropy": 1.766348421573639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1758265122771263, "step": 21424 }, { "epoch": 0.42852, "grad_norm": 2.125, "grad_norm_var": 0.041757965087890626, "learning_rate": 0.0001, "loss": 4.1498, "loss/crossentropy": 1.9646649360656738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052430659532547, "step": 21426 }, { "epoch": 0.42856, "grad_norm": 1.921875, "grad_norm_var": 0.042281087239583334, "learning_rate": 0.0001, "loss": 4.0817, "loss/crossentropy": 1.8953626155853271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974174976348877, "step": 21428 }, { "epoch": 0.4286, "grad_norm": 1.765625, "grad_norm_var": 0.04601008097330729, "learning_rate": 0.0001, "loss": 3.7752, "loss/crossentropy": 1.9713319540023804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18385013192892075, "step": 21430 }, { "epoch": 0.42864, "grad_norm": 1.875, "grad_norm_var": 0.047501627604166666, "learning_rate": 0.0001, "loss": 4.1555, "loss/crossentropy": 2.2564018964767456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19498983025550842, "step": 21432 }, { "epoch": 0.42868, "grad_norm": 2.21875, "grad_norm_var": 0.05091044108072917, "learning_rate": 0.0001, "loss": 4.1351, "loss/crossentropy": 2.1691616773605347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22087589651346207, "step": 21434 }, { "epoch": 0.42872, "grad_norm": 9.1875, "grad_norm_var": 3.257096354166667, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 1.9714866280555725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19079908728599548, "step": 21436 }, { "epoch": 0.42876, "grad_norm": 2.0, "grad_norm_var": 3.2328834533691406, "learning_rate": 0.0001, "loss": 4.0687, "loss/crossentropy": 2.2311829328536987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19929265975952148, "step": 21438 }, { "epoch": 0.4288, "grad_norm": 1.984375, "grad_norm_var": 3.231501261393229, "learning_rate": 0.0001, "loss": 4.0397, "loss/crossentropy": 1.9644780158996582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1768122911453247, "step": 21440 }, { "epoch": 0.42884, "grad_norm": 1.796875, "grad_norm_var": 3.253226725260417, "learning_rate": 0.0001, "loss": 3.9993, "loss/crossentropy": 1.93829345703125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19605614244937897, "step": 21442 }, { "epoch": 0.42888, "grad_norm": 1.9140625, "grad_norm_var": 3.2553304036458335, "learning_rate": 0.0001, "loss": 3.8987, "loss/crossentropy": 1.9488537907600403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20282328873872757, "step": 21444 }, { "epoch": 0.42892, "grad_norm": 1.75, "grad_norm_var": 3.254095204671224, "learning_rate": 0.0001, "loss": 3.8275, "loss/crossentropy": 1.93411123752594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18476436287164688, "step": 21446 }, { "epoch": 0.42896, "grad_norm": 1.8828125, "grad_norm_var": 3.24100341796875, "learning_rate": 0.0001, "loss": 4.1485, "loss/crossentropy": 2.4150885343551636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21736841648817062, "step": 21448 }, { "epoch": 0.429, "grad_norm": 2.078125, "grad_norm_var": 3.2536610921223956, "learning_rate": 0.0001, "loss": 4.227, "loss/crossentropy": 2.117435574531555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20417648553848267, "step": 21450 }, { "epoch": 0.42904, "grad_norm": 1.8984375, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 3.9162, "loss/crossentropy": 2.0840513706207275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20294595509767532, "step": 21452 }, { "epoch": 0.42908, "grad_norm": 1.9296875, "grad_norm_var": 0.0087066650390625, "learning_rate": 0.0001, "loss": 3.9088, "loss/crossentropy": 2.1203198432922363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19258832931518555, "step": 21454 }, { "epoch": 0.42912, "grad_norm": 1.8984375, "grad_norm_var": 0.007503000895182291, "learning_rate": 0.0001, "loss": 4.1205, "loss/crossentropy": 2.1161458492279053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18999946117401123, "step": 21456 }, { "epoch": 0.42916, "grad_norm": 1.828125, "grad_norm_var": 0.006876373291015625, "learning_rate": 0.0001, "loss": 3.9288, "loss/crossentropy": 1.7208130955696106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20022748410701752, "step": 21458 }, { "epoch": 0.4292, "grad_norm": 2.046875, "grad_norm_var": 0.007940419514973958, "learning_rate": 0.0001, "loss": 4.1794, "loss/crossentropy": 2.0148571729660034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059563249349594, "step": 21460 }, { "epoch": 0.42924, "grad_norm": 1.8203125, "grad_norm_var": 0.006615193684895834, "learning_rate": 0.0001, "loss": 4.0039, "loss/crossentropy": 1.985020637512207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18583452701568604, "step": 21462 }, { "epoch": 0.42928, "grad_norm": 1.8984375, "grad_norm_var": 0.0056955973307291664, "learning_rate": 0.0001, "loss": 3.9623, "loss/crossentropy": 1.8767080903053284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18215155601501465, "step": 21464 }, { "epoch": 0.42932, "grad_norm": 1.9375, "grad_norm_var": 0.0037638346354166665, "learning_rate": 0.0001, "loss": 4.1592, "loss/crossentropy": 2.162920832633972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20505908131599426, "step": 21466 }, { "epoch": 0.42936, "grad_norm": 2.203125, "grad_norm_var": 0.00985107421875, "learning_rate": 0.0001, "loss": 4.0253, "loss/crossentropy": 2.329980731010437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23726581782102585, "step": 21468 }, { "epoch": 0.4294, "grad_norm": 1.8984375, "grad_norm_var": 0.01512451171875, "learning_rate": 0.0001, "loss": 4.0204, "loss/crossentropy": 2.020410180091858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19389280676841736, "step": 21470 }, { "epoch": 0.42944, "grad_norm": 1.9921875, "grad_norm_var": 0.01613133748372396, "learning_rate": 0.0001, "loss": 4.0849, "loss/crossentropy": 2.157116711139679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1991463601589203, "step": 21472 }, { "epoch": 0.42948, "grad_norm": 1.9453125, "grad_norm_var": 0.014992014567057291, "learning_rate": 0.0001, "loss": 4.0913, "loss/crossentropy": 2.0108723640441895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19546864926815033, "step": 21474 }, { "epoch": 0.42952, "grad_norm": 1.953125, "grad_norm_var": 0.0141998291015625, "learning_rate": 0.0001, "loss": 4.2555, "loss/crossentropy": 2.154988646507263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19204716384410858, "step": 21476 }, { "epoch": 0.42956, "grad_norm": 1.890625, "grad_norm_var": 0.013549550374348959, "learning_rate": 0.0001, "loss": 3.7924, "loss/crossentropy": 1.9463204145431519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2041161060333252, "step": 21478 }, { "epoch": 0.4296, "grad_norm": 2.015625, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 4.1798, "loss/crossentropy": 2.3123443126678467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21649713814258575, "step": 21480 }, { "epoch": 0.42964, "grad_norm": 1.9140625, "grad_norm_var": 0.015160115559895833, "learning_rate": 0.0001, "loss": 3.8571, "loss/crossentropy": 1.658549726009369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16132741421461105, "step": 21482 }, { "epoch": 0.42968, "grad_norm": 1.8515625, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 3.8965, "loss/crossentropy": 1.9251441955566406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17892278730869293, "step": 21484 }, { "epoch": 0.42972, "grad_norm": 1.9609375, "grad_norm_var": 0.004523722330729166, "learning_rate": 0.0001, "loss": 4.002, "loss/crossentropy": 1.7956182956695557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17938754707574844, "step": 21486 }, { "epoch": 0.42976, "grad_norm": 2.046875, "grad_norm_var": 0.004312896728515625, "learning_rate": 0.0001, "loss": 4.0373, "loss/crossentropy": 1.941636562347412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159031555056572, "step": 21488 }, { "epoch": 0.4298, "grad_norm": 1.9765625, "grad_norm_var": 0.0047108968098958336, "learning_rate": 0.0001, "loss": 4.2861, "loss/crossentropy": 2.4158248901367188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2346770316362381, "step": 21490 }, { "epoch": 0.42984, "grad_norm": 2.171875, "grad_norm_var": 0.009110260009765624, "learning_rate": 0.0001, "loss": 4.0334, "loss/crossentropy": 1.749859094619751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17030366510152817, "step": 21492 }, { "epoch": 0.42988, "grad_norm": 1.9609375, "grad_norm_var": 0.008622233072916667, "learning_rate": 0.0001, "loss": 4.1944, "loss/crossentropy": 2.0735827684402466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986066773533821, "step": 21494 }, { "epoch": 0.42992, "grad_norm": 1.859375, "grad_norm_var": 0.008556874593098958, "learning_rate": 0.0001, "loss": 3.8175, "loss/crossentropy": 1.744678020477295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16986121982336044, "step": 21496 }, { "epoch": 0.42996, "grad_norm": 1.8984375, "grad_norm_var": 0.007559967041015625, "learning_rate": 0.0001, "loss": 4.116, "loss/crossentropy": 2.060591220855713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20758916437625885, "step": 21498 }, { "epoch": 0.43, "grad_norm": 1.8515625, "grad_norm_var": 0.0074045817057291664, "learning_rate": 0.0001, "loss": 3.967, "loss/crossentropy": 2.2836644649505615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20620425045490265, "step": 21500 }, { "epoch": 0.43004, "grad_norm": 1.7421875, "grad_norm_var": 0.010252888997395833, "learning_rate": 0.0001, "loss": 3.9827, "loss/crossentropy": 2.1133594512939453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954261213541031, "step": 21502 }, { "epoch": 0.43008, "grad_norm": 2.03125, "grad_norm_var": 0.0100250244140625, "learning_rate": 0.0001, "loss": 4.0973, "loss/crossentropy": 2.5492414236068726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23168637603521347, "step": 21504 }, { "epoch": 0.43012, "grad_norm": 1.78125, "grad_norm_var": 0.0113922119140625, "learning_rate": 0.0001, "loss": 3.6322, "loss/crossentropy": 1.7122142314910889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16849397122859955, "step": 21506 }, { "epoch": 0.43016, "grad_norm": 2.015625, "grad_norm_var": 0.0077288309733072914, "learning_rate": 0.0001, "loss": 4.0675, "loss/crossentropy": 2.3052932024002075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21252303570508957, "step": 21508 }, { "epoch": 0.4302, "grad_norm": 2.046875, "grad_norm_var": 0.009570058186848958, "learning_rate": 0.0001, "loss": 3.9751, "loss/crossentropy": 2.5156666040420532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26235880702733994, "step": 21510 }, { "epoch": 0.43024, "grad_norm": 1.9375, "grad_norm_var": 0.009419504801432292, "learning_rate": 0.0001, "loss": 3.8061, "loss/crossentropy": 1.884554922580719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1867503896355629, "step": 21512 }, { "epoch": 0.43028, "grad_norm": 2.25, "grad_norm_var": 0.01619440714518229, "learning_rate": 0.0001, "loss": 4.1092, "loss/crossentropy": 2.1004000902175903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19210458546876907, "step": 21514 }, { "epoch": 0.43032, "grad_norm": 1.890625, "grad_norm_var": 0.01565526326497396, "learning_rate": 0.0001, "loss": 4.0146, "loss/crossentropy": 1.606240451335907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15947996824979782, "step": 21516 }, { "epoch": 0.43036, "grad_norm": 1.90625, "grad_norm_var": 0.013130696614583333, "learning_rate": 0.0001, "loss": 4.1195, "loss/crossentropy": 1.9417288303375244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19736050814390182, "step": 21518 }, { "epoch": 0.4304, "grad_norm": 1.8203125, "grad_norm_var": 0.014204915364583333, "learning_rate": 0.0001, "loss": 3.8322, "loss/crossentropy": 1.9607113599777222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17349773645401, "step": 21520 }, { "epoch": 0.43044, "grad_norm": 1.8828125, "grad_norm_var": 0.012105305989583334, "learning_rate": 0.0001, "loss": 3.9789, "loss/crossentropy": 1.9606893062591553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726546436548233, "step": 21522 }, { "epoch": 0.43048, "grad_norm": 2.03125, "grad_norm_var": 0.011915842692057291, "learning_rate": 0.0001, "loss": 4.1203, "loss/crossentropy": 1.9426813125610352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19272266328334808, "step": 21524 }, { "epoch": 0.43052, "grad_norm": 1.78125, "grad_norm_var": 0.011533355712890625, "learning_rate": 0.0001, "loss": 3.8158, "loss/crossentropy": 2.1843879222869873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986662521958351, "step": 21526 }, { "epoch": 0.43056, "grad_norm": 1.90625, "grad_norm_var": 0.011146799723307291, "learning_rate": 0.0001, "loss": 3.8314, "loss/crossentropy": 1.7259829640388489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1759045571088791, "step": 21528 }, { "epoch": 0.4306, "grad_norm": 2.015625, "grad_norm_var": 0.004670969645182292, "learning_rate": 0.0001, "loss": 3.7475, "loss/crossentropy": 1.7714526653289795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20490919053554535, "step": 21530 }, { "epoch": 0.43064, "grad_norm": 2.046875, "grad_norm_var": 0.005631256103515625, "learning_rate": 0.0001, "loss": 3.905, "loss/crossentropy": 1.9385235905647278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19307191669940948, "step": 21532 }, { "epoch": 0.43068, "grad_norm": 1.9375, "grad_norm_var": 0.006034088134765625, "learning_rate": 0.0001, "loss": 4.2582, "loss/crossentropy": 1.956704020500183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24551841616630554, "step": 21534 }, { "epoch": 0.43072, "grad_norm": 2.015625, "grad_norm_var": 0.0048736572265625, "learning_rate": 0.0001, "loss": 4.0436, "loss/crossentropy": 2.0809181332588196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20348872244358063, "step": 21536 }, { "epoch": 0.43076, "grad_norm": 1.9609375, "grad_norm_var": 0.004589589436848959, "learning_rate": 0.0001, "loss": 4.1488, "loss/crossentropy": 1.9148198366165161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19790956377983093, "step": 21538 }, { "epoch": 0.4308, "grad_norm": 2.171875, "grad_norm_var": 0.007207997639973958, "learning_rate": 0.0001, "loss": 3.9439, "loss/crossentropy": 2.0833089351654053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19176144897937775, "step": 21540 }, { "epoch": 0.43084, "grad_norm": 1.828125, "grad_norm_var": 0.006376139322916667, "learning_rate": 0.0001, "loss": 4.0987, "loss/crossentropy": 2.1066776514053345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20604360103607178, "step": 21542 }, { "epoch": 0.43088, "grad_norm": 1.921875, "grad_norm_var": 0.006248982747395834, "learning_rate": 0.0001, "loss": 3.9294, "loss/crossentropy": 2.125930666923523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22426339238882065, "step": 21544 }, { "epoch": 0.43092, "grad_norm": 1.90625, "grad_norm_var": 0.0062978108723958336, "learning_rate": 0.0001, "loss": 3.8652, "loss/crossentropy": 1.743666172027588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1681881994009018, "step": 21546 }, { "epoch": 0.43096, "grad_norm": 2.078125, "grad_norm_var": 0.00712890625, "learning_rate": 0.0001, "loss": 3.8952, "loss/crossentropy": 1.882837951183319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18062414973974228, "step": 21548 }, { "epoch": 0.431, "grad_norm": 1.78125, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 3.6019, "loss/crossentropy": 1.8259800672531128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19174453616142273, "step": 21550 }, { "epoch": 0.43104, "grad_norm": 1.8671875, "grad_norm_var": 0.009199778238932291, "learning_rate": 0.0001, "loss": 4.0255, "loss/crossentropy": 2.3379745483398438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22536730766296387, "step": 21552 }, { "epoch": 0.43108, "grad_norm": 2.09375, "grad_norm_var": 0.010560862223307292, "learning_rate": 0.0001, "loss": 3.9447, "loss/crossentropy": 2.123952627182007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21830175817012787, "step": 21554 }, { "epoch": 0.43112, "grad_norm": 1.9375, "grad_norm_var": 0.006849924723307292, "learning_rate": 0.0001, "loss": 4.1353, "loss/crossentropy": 2.286398410797119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20413320511579514, "step": 21556 }, { "epoch": 0.43116, "grad_norm": 2.015625, "grad_norm_var": 0.006044260660807292, "learning_rate": 0.0001, "loss": 3.9117, "loss/crossentropy": 2.0898618698120117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2105274796485901, "step": 21558 }, { "epoch": 0.4312, "grad_norm": 2.015625, "grad_norm_var": 0.0067789713541666664, "learning_rate": 0.0001, "loss": 4.1431, "loss/crossentropy": 1.940380573272705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18282227218151093, "step": 21560 }, { "epoch": 0.43124, "grad_norm": 2.078125, "grad_norm_var": 0.007657623291015625, "learning_rate": 0.0001, "loss": 3.9952, "loss/crossentropy": 1.6752060055732727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20306075364351273, "step": 21562 }, { "epoch": 0.43128, "grad_norm": 2.234375, "grad_norm_var": 0.011091105143229167, "learning_rate": 0.0001, "loss": 4.3745, "loss/crossentropy": 2.234758734703064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19758003950119019, "step": 21564 }, { "epoch": 0.43132, "grad_norm": 1.9609375, "grad_norm_var": 0.00819091796875, "learning_rate": 0.0001, "loss": 4.2822, "loss/crossentropy": 2.126620352268219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20688238739967346, "step": 21566 }, { "epoch": 0.43136, "grad_norm": 1.984375, "grad_norm_var": 0.008084869384765625, "learning_rate": 0.0001, "loss": 4.1531, "loss/crossentropy": 1.9397594332695007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18199825286865234, "step": 21568 }, { "epoch": 0.4314, "grad_norm": 2.0625, "grad_norm_var": 0.0080322265625, "learning_rate": 0.0001, "loss": 4.1644, "loss/crossentropy": 2.1395343542099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074909508228302, "step": 21570 }, { "epoch": 0.43144, "grad_norm": 1.8359375, "grad_norm_var": 0.010827382405598959, "learning_rate": 0.0001, "loss": 3.7397, "loss/crossentropy": 1.9207513332366943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881982833147049, "step": 21572 }, { "epoch": 0.43148, "grad_norm": 2.34375, "grad_norm_var": 0.01856053670247396, "learning_rate": 0.0001, "loss": 4.1079, "loss/crossentropy": 2.0409696102142334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21291138231754303, "step": 21574 }, { "epoch": 0.43152, "grad_norm": 1.953125, "grad_norm_var": 0.02153498331705729, "learning_rate": 0.0001, "loss": 4.0419, "loss/crossentropy": 2.0753902792930603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20471004396677017, "step": 21576 }, { "epoch": 0.43156, "grad_norm": 1.875, "grad_norm_var": 0.022321573893229165, "learning_rate": 0.0001, "loss": 4.1617, "loss/crossentropy": 2.412545084953308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21189319342374802, "step": 21578 }, { "epoch": 0.4316, "grad_norm": 1.875, "grad_norm_var": 0.019850413004557293, "learning_rate": 0.0001, "loss": 3.9468, "loss/crossentropy": 1.9271405339241028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18710750341415405, "step": 21580 }, { "epoch": 0.43164, "grad_norm": 2.0, "grad_norm_var": 0.019321441650390625, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 2.000905692577362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17429713159799576, "step": 21582 }, { "epoch": 0.43168, "grad_norm": 1.9140625, "grad_norm_var": 0.019334920247395835, "learning_rate": 0.0001, "loss": 3.8559, "loss/crossentropy": 1.9592137932777405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189944788813591, "step": 21584 }, { "epoch": 0.43172, "grad_norm": 1.953125, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 4.1447, "loss/crossentropy": 2.283234119415283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18164613842964172, "step": 21586 }, { "epoch": 0.43176, "grad_norm": 1.9921875, "grad_norm_var": 0.0160064697265625, "learning_rate": 0.0001, "loss": 4.0019, "loss/crossentropy": 2.1789008378982544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087143510580063, "step": 21588 }, { "epoch": 0.4318, "grad_norm": 1.875, "grad_norm_var": 0.009000396728515625, "learning_rate": 0.0001, "loss": 4.0039, "loss/crossentropy": 2.192038893699646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19432441890239716, "step": 21590 }, { "epoch": 0.43184, "grad_norm": 1.84375, "grad_norm_var": 0.005130767822265625, "learning_rate": 0.0001, "loss": 4.0142, "loss/crossentropy": 1.8618733286857605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18207471072673798, "step": 21592 }, { "epoch": 0.43188, "grad_norm": 2.078125, "grad_norm_var": 0.006186676025390625, "learning_rate": 0.0001, "loss": 4.3056, "loss/crossentropy": 2.423651933670044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21819397807121277, "step": 21594 }, { "epoch": 0.43192, "grad_norm": 1.984375, "grad_norm_var": 0.005765533447265625, "learning_rate": 0.0001, "loss": 4.0506, "loss/crossentropy": 1.7313976287841797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16011521220207214, "step": 21596 }, { "epoch": 0.43196, "grad_norm": 1.9140625, "grad_norm_var": 0.0051043192545572914, "learning_rate": 0.0001, "loss": 4.0915, "loss/crossentropy": 2.089974284172058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20138490200042725, "step": 21598 }, { "epoch": 0.432, "grad_norm": 2.078125, "grad_norm_var": 0.018895467122395832, "learning_rate": 0.0001, "loss": 4.3367, "loss/crossentropy": 1.9697930812835693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22859974950551987, "step": 21600 }, { "epoch": 0.43204, "grad_norm": 1.8671875, "grad_norm_var": 0.019832356770833334, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 1.7605210542678833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17675022035837173, "step": 21602 }, { "epoch": 0.43208, "grad_norm": 1.953125, "grad_norm_var": 0.019755045572916668, "learning_rate": 0.0001, "loss": 3.9467, "loss/crossentropy": 1.9884806275367737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19690094888210297, "step": 21604 }, { "epoch": 0.43212, "grad_norm": 1.9375, "grad_norm_var": 0.018521881103515624, "learning_rate": 0.0001, "loss": 3.7705, "loss/crossentropy": 1.987808346748352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20262162387371063, "step": 21606 }, { "epoch": 0.43216, "grad_norm": 1.7578125, "grad_norm_var": 0.026105753580729165, "learning_rate": 0.0001, "loss": 3.683, "loss/crossentropy": 2.220315098762512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1899535208940506, "step": 21608 }, { "epoch": 0.4322, "grad_norm": 1.90625, "grad_norm_var": 0.024347941080729168, "learning_rate": 0.0001, "loss": 4.0442, "loss/crossentropy": 2.245596408843994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21112707257270813, "step": 21610 }, { "epoch": 0.43224, "grad_norm": 1.90625, "grad_norm_var": 0.0252838134765625, "learning_rate": 0.0001, "loss": 4.2442, "loss/crossentropy": 2.3593950271606445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22111155092716217, "step": 21612 }, { "epoch": 0.43228, "grad_norm": 1.9453125, "grad_norm_var": 0.025721995035807292, "learning_rate": 0.0001, "loss": 3.7807, "loss/crossentropy": 1.7467412948608398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16833512485027313, "step": 21614 }, { "epoch": 0.43232, "grad_norm": 1.7578125, "grad_norm_var": 0.011328125, "learning_rate": 0.0001, "loss": 3.815, "loss/crossentropy": 2.160380005836487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22670412063598633, "step": 21616 }, { "epoch": 0.43236, "grad_norm": 1.8125, "grad_norm_var": 0.012007395426432291, "learning_rate": 0.0001, "loss": 3.9665, "loss/crossentropy": 2.1650896072387695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19781000167131424, "step": 21618 }, { "epoch": 0.4324, "grad_norm": 1.875, "grad_norm_var": 0.011987050374348959, "learning_rate": 0.0001, "loss": 3.8044, "loss/crossentropy": 2.028198480606079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905156373977661, "step": 21620 }, { "epoch": 0.43244, "grad_norm": 1.859375, "grad_norm_var": 0.014241282145182292, "learning_rate": 0.0001, "loss": 3.9015, "loss/crossentropy": 2.30538147687912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474938094615936, "step": 21622 }, { "epoch": 0.43248, "grad_norm": 1.90625, "grad_norm_var": 0.009450022379557292, "learning_rate": 0.0001, "loss": 3.9623, "loss/crossentropy": 1.8054092526435852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1670922413468361, "step": 21624 }, { "epoch": 0.43252, "grad_norm": 1.921875, "grad_norm_var": 0.011578114827473958, "learning_rate": 0.0001, "loss": 4.2168, "loss/crossentropy": 1.9821457862854004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19581317901611328, "step": 21626 }, { "epoch": 0.43256, "grad_norm": 1.8671875, "grad_norm_var": 0.011197916666666667, "learning_rate": 0.0001, "loss": 3.9882, "loss/crossentropy": 2.2404789328575134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19181591272354126, "step": 21628 }, { "epoch": 0.4326, "grad_norm": 1.9453125, "grad_norm_var": 0.013557942708333333, "learning_rate": 0.0001, "loss": 4.0683, "loss/crossentropy": 2.0229339003562927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2553243041038513, "step": 21630 }, { "epoch": 0.43264, "grad_norm": 1.984375, "grad_norm_var": 0.011122385660807291, "learning_rate": 0.0001, "loss": 4.2193, "loss/crossentropy": 2.013065457344055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24048484861850739, "step": 21632 }, { "epoch": 0.43268, "grad_norm": 1.8671875, "grad_norm_var": 0.009173329671223958, "learning_rate": 0.0001, "loss": 3.9129, "loss/crossentropy": 1.9411469101905823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19341549277305603, "step": 21634 }, { "epoch": 0.43272, "grad_norm": 1.953125, "grad_norm_var": 0.008558909098307291, "learning_rate": 0.0001, "loss": 4.0984, "loss/crossentropy": 2.1277971267700195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19953395426273346, "step": 21636 }, { "epoch": 0.43276, "grad_norm": 2.015625, "grad_norm_var": 0.0071795145670572914, "learning_rate": 0.0001, "loss": 4.2077, "loss/crossentropy": 2.1296733617782593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117133647203445, "step": 21638 }, { "epoch": 0.4328, "grad_norm": 2.109375, "grad_norm_var": 0.009376780192057291, "learning_rate": 0.0001, "loss": 3.8594, "loss/crossentropy": 1.6910184025764465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17567047476768494, "step": 21640 }, { "epoch": 0.43284, "grad_norm": 1.8046875, "grad_norm_var": 0.009845987955729166, "learning_rate": 0.0001, "loss": 3.9411, "loss/crossentropy": 1.9889350533485413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18229827284812927, "step": 21642 }, { "epoch": 0.43288, "grad_norm": 1.9140625, "grad_norm_var": 0.008931223551432292, "learning_rate": 0.0001, "loss": 3.8053, "loss/crossentropy": 1.885926902294159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18047264218330383, "step": 21644 }, { "epoch": 0.43292, "grad_norm": 1.9296875, "grad_norm_var": 0.006471506754557292, "learning_rate": 0.0001, "loss": 3.9383, "loss/crossentropy": 1.9706445336341858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18135320395231247, "step": 21646 }, { "epoch": 0.43296, "grad_norm": 1.9921875, "grad_norm_var": 0.007127888997395833, "learning_rate": 0.0001, "loss": 3.8972, "loss/crossentropy": 1.7751979231834412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1693567857146263, "step": 21648 }, { "epoch": 0.433, "grad_norm": 1.9140625, "grad_norm_var": 0.006884511311848958, "learning_rate": 0.0001, "loss": 4.0094, "loss/crossentropy": 2.025897264480591, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18943700194358826, "step": 21650 }, { "epoch": 0.43304, "grad_norm": 1.84375, "grad_norm_var": 0.007627105712890625, "learning_rate": 0.0001, "loss": 3.8503, "loss/crossentropy": 1.7019765973091125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1738589182496071, "step": 21652 }, { "epoch": 0.43308, "grad_norm": 2.140625, "grad_norm_var": 0.010225168863932292, "learning_rate": 0.0001, "loss": 4.1418, "loss/crossentropy": 1.8927075266838074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1616983264684677, "step": 21654 }, { "epoch": 0.43312, "grad_norm": 2.09375, "grad_norm_var": 0.012336222330729167, "learning_rate": 0.0001, "loss": 4.3299, "loss/crossentropy": 2.2224843502044678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21432733535766602, "step": 21656 }, { "epoch": 0.43316, "grad_norm": 2.21875, "grad_norm_var": 0.01502685546875, "learning_rate": 0.0001, "loss": 4.1572, "loss/crossentropy": 2.1968295574188232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20150140672922134, "step": 21658 }, { "epoch": 0.4332, "grad_norm": 2.046875, "grad_norm_var": 0.015057118733723958, "learning_rate": 0.0001, "loss": 4.0046, "loss/crossentropy": 2.1984314918518066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21520624309778214, "step": 21660 }, { "epoch": 0.43324, "grad_norm": 2.015625, "grad_norm_var": 0.01613133748372396, "learning_rate": 0.0001, "loss": 4.0638, "loss/crossentropy": 2.183030843734741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994212418794632, "step": 21662 }, { "epoch": 0.43328, "grad_norm": 1.984375, "grad_norm_var": 0.016353352864583334, "learning_rate": 0.0001, "loss": 4.108, "loss/crossentropy": 2.0716358423233032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18969116359949112, "step": 21664 }, { "epoch": 0.43332, "grad_norm": 1.921875, "grad_norm_var": 0.016043853759765626, "learning_rate": 0.0001, "loss": 4.0988, "loss/crossentropy": 1.8427326679229736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19842079281806946, "step": 21666 }, { "epoch": 0.43336, "grad_norm": 1.9921875, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 4.0744, "loss/crossentropy": 2.3021219968795776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21595575660467148, "step": 21668 }, { "epoch": 0.4334, "grad_norm": 1.8984375, "grad_norm_var": 0.01593195597330729, "learning_rate": 0.0001, "loss": 3.8216, "loss/crossentropy": 2.2828067541122437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21141938865184784, "step": 21670 }, { "epoch": 0.43344, "grad_norm": 2.0625, "grad_norm_var": 0.014955393473307292, "learning_rate": 0.0001, "loss": 4.0502, "loss/crossentropy": 2.167429566383362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22402016073465347, "step": 21672 }, { "epoch": 0.43348, "grad_norm": 1.9609375, "grad_norm_var": 0.010632069905598958, "learning_rate": 0.0001, "loss": 3.9943, "loss/crossentropy": 2.02812397480011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061637043952942, "step": 21674 }, { "epoch": 0.43352, "grad_norm": 1.7109375, "grad_norm_var": 0.013724517822265626, "learning_rate": 0.0001, "loss": 3.6925, "loss/crossentropy": 1.96470308303833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.168222576379776, "step": 21676 }, { "epoch": 0.43356, "grad_norm": 2.03125, "grad_norm_var": 0.013163248697916666, "learning_rate": 0.0001, "loss": 4.1567, "loss/crossentropy": 2.640696406364441, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22549600154161453, "step": 21678 }, { "epoch": 0.4336, "grad_norm": 1.8515625, "grad_norm_var": 0.010058339436848958, "learning_rate": 0.0001, "loss": 4.0939, "loss/crossentropy": 2.140254855155945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973073035478592, "step": 21680 }, { "epoch": 0.43364, "grad_norm": 1.921875, "grad_norm_var": 0.0110595703125, "learning_rate": 0.0001, "loss": 4.1028, "loss/crossentropy": 2.3025078773498535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19433188438415527, "step": 21682 }, { "epoch": 0.43368, "grad_norm": 2.015625, "grad_norm_var": 0.012452952067057292, "learning_rate": 0.0001, "loss": 4.344, "loss/crossentropy": 2.1558977365493774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20599104464054108, "step": 21684 }, { "epoch": 0.43372, "grad_norm": 1.9609375, "grad_norm_var": 0.014780426025390625, "learning_rate": 0.0001, "loss": 3.7918, "loss/crossentropy": 1.8071665167808533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18443845212459564, "step": 21686 }, { "epoch": 0.43376, "grad_norm": 1.984375, "grad_norm_var": 0.018477121988932293, "learning_rate": 0.0001, "loss": 4.0928, "loss/crossentropy": 2.41480815410614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22098398208618164, "step": 21688 }, { "epoch": 0.4338, "grad_norm": 1.8828125, "grad_norm_var": 0.019551595052083332, "learning_rate": 0.0001, "loss": 3.9046, "loss/crossentropy": 2.0506592988967896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17161494493484497, "step": 21690 }, { "epoch": 0.43384, "grad_norm": 1.9453125, "grad_norm_var": 0.018187459309895834, "learning_rate": 0.0001, "loss": 3.9579, "loss/crossentropy": 2.170476496219635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19826646149158478, "step": 21692 }, { "epoch": 0.43388, "grad_norm": 1.7890625, "grad_norm_var": 0.019665273030598958, "learning_rate": 0.0001, "loss": 3.9356, "loss/crossentropy": 1.6374267935752869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1581001877784729, "step": 21694 }, { "epoch": 0.43392, "grad_norm": 1.90625, "grad_norm_var": 0.018944295247395833, "learning_rate": 0.0001, "loss": 3.9333, "loss/crossentropy": 1.9059698581695557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18373095244169235, "step": 21696 }, { "epoch": 0.43396, "grad_norm": 1.9765625, "grad_norm_var": 0.01762669881184896, "learning_rate": 0.0001, "loss": 4.2217, "loss/crossentropy": 2.238908290863037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044297158718109, "step": 21698 }, { "epoch": 0.434, "grad_norm": 2.015625, "grad_norm_var": 0.01877415974934896, "learning_rate": 0.0001, "loss": 3.6318, "loss/crossentropy": 1.7991122007369995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18948530405759811, "step": 21700 }, { "epoch": 0.43404, "grad_norm": 1.828125, "grad_norm_var": 0.01668268839518229, "learning_rate": 0.0001, "loss": 3.754, "loss/crossentropy": 1.773992896080017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16768701374530792, "step": 21702 }, { "epoch": 0.43408, "grad_norm": 2.09375, "grad_norm_var": 0.013051096598307292, "learning_rate": 0.0001, "loss": 4.325, "loss/crossentropy": 2.021883547306061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20071732252836227, "step": 21704 }, { "epoch": 0.43412, "grad_norm": 1.9765625, "grad_norm_var": 0.0167144775390625, "learning_rate": 0.0001, "loss": 4.3162, "loss/crossentropy": 2.2198556661605835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20410272479057312, "step": 21706 }, { "epoch": 0.43416, "grad_norm": 1.8984375, "grad_norm_var": 0.013986968994140625, "learning_rate": 0.0001, "loss": 4.0144, "loss/crossentropy": 2.239061653614044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614881813526154, "step": 21708 }, { "epoch": 0.4342, "grad_norm": 2.25, "grad_norm_var": 0.01849543253580729, "learning_rate": 0.0001, "loss": 4.0531, "loss/crossentropy": 1.720937430858612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19312319159507751, "step": 21710 }, { "epoch": 0.43424, "grad_norm": 1.84375, "grad_norm_var": 0.01881103515625, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 2.577925205230713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20975439995527267, "step": 21712 }, { "epoch": 0.43428, "grad_norm": 2.109375, "grad_norm_var": 0.019634755452473958, "learning_rate": 0.0001, "loss": 4.2833, "loss/crossentropy": 2.3273919820785522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.203931525349617, "step": 21714 }, { "epoch": 0.43432, "grad_norm": 1.8671875, "grad_norm_var": 0.017316691080729165, "learning_rate": 0.0001, "loss": 3.7922, "loss/crossentropy": 1.6773480772972107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1546436920762062, "step": 21716 }, { "epoch": 0.43436, "grad_norm": 1.875, "grad_norm_var": 0.015986887613932292, "learning_rate": 0.0001, "loss": 4.0342, "loss/crossentropy": 2.4566088914871216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2312512993812561, "step": 21718 }, { "epoch": 0.4344, "grad_norm": 1.859375, "grad_norm_var": 0.015998331705729167, "learning_rate": 0.0001, "loss": 3.5272, "loss/crossentropy": 1.8777849674224854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18316396325826645, "step": 21720 }, { "epoch": 0.43444, "grad_norm": 1.9375, "grad_norm_var": 0.011744944254557292, "learning_rate": 0.0001, "loss": 4.0373, "loss/crossentropy": 2.131688177585602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945742592215538, "step": 21722 }, { "epoch": 0.43448, "grad_norm": 2.0, "grad_norm_var": 0.0122222900390625, "learning_rate": 0.0001, "loss": 3.9186, "loss/crossentropy": 1.9808812141418457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051435261964798, "step": 21724 }, { "epoch": 0.43452, "grad_norm": 2.171875, "grad_norm_var": 0.009037017822265625, "learning_rate": 0.0001, "loss": 3.9247, "loss/crossentropy": 2.1240362524986267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22029808908700943, "step": 21726 }, { "epoch": 0.43456, "grad_norm": 1.9375, "grad_norm_var": 0.0087554931640625, "learning_rate": 0.0001, "loss": 4.1359, "loss/crossentropy": 2.2637228965759277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24003028869628906, "step": 21728 }, { "epoch": 0.4346, "grad_norm": 1.84375, "grad_norm_var": 0.0075266520182291664, "learning_rate": 0.0001, "loss": 3.7561, "loss/crossentropy": 2.0770394802093506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.185364231467247, "step": 21730 }, { "epoch": 0.43464, "grad_norm": 1.9921875, "grad_norm_var": 0.0071197509765625, "learning_rate": 0.0001, "loss": 4.1621, "loss/crossentropy": 2.1420366764068604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21707475185394287, "step": 21732 }, { "epoch": 0.43468, "grad_norm": 1.8984375, "grad_norm_var": 0.006940714518229167, "learning_rate": 0.0001, "loss": 4.0836, "loss/crossentropy": 1.8017431497573853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1667446494102478, "step": 21734 }, { "epoch": 0.43472, "grad_norm": 1.84375, "grad_norm_var": 0.007490793863932292, "learning_rate": 0.0001, "loss": 4.0098, "loss/crossentropy": 2.215123176574707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24778465926647186, "step": 21736 }, { "epoch": 0.43476, "grad_norm": 1.8046875, "grad_norm_var": 0.009708658854166666, "learning_rate": 0.0001, "loss": 3.7596, "loss/crossentropy": 1.7275863289833069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18431701511144638, "step": 21738 }, { "epoch": 0.4348, "grad_norm": 2.0, "grad_norm_var": 0.009643300374348959, "learning_rate": 0.0001, "loss": 4.1243, "loss/crossentropy": 2.0645371675491333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19622839987277985, "step": 21740 }, { "epoch": 0.43484, "grad_norm": 1.84375, "grad_norm_var": 0.006563059488932292, "learning_rate": 0.0001, "loss": 3.9028, "loss/crossentropy": 1.8695591688156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18317092955112457, "step": 21742 }, { "epoch": 0.43488, "grad_norm": 2.046875, "grad_norm_var": 0.010239410400390624, "learning_rate": 0.0001, "loss": 4.1963, "loss/crossentropy": 2.090248942375183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20329977571964264, "step": 21744 }, { "epoch": 0.43492, "grad_norm": 1.875, "grad_norm_var": 0.011307779947916667, "learning_rate": 0.0001, "loss": 3.6347, "loss/crossentropy": 1.943733274936676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20422999560832977, "step": 21746 }, { "epoch": 0.43496, "grad_norm": 1.8203125, "grad_norm_var": 0.012483723958333333, "learning_rate": 0.0001, "loss": 4.0101, "loss/crossentropy": 2.2485480308532715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20576569437980652, "step": 21748 }, { "epoch": 0.435, "grad_norm": 1.953125, "grad_norm_var": 0.012569173177083334, "learning_rate": 0.0001, "loss": 4.1053, "loss/crossentropy": 2.184234142303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19477711617946625, "step": 21750 }, { "epoch": 0.43504, "grad_norm": 1.9609375, "grad_norm_var": 0.011311848958333334, "learning_rate": 0.0001, "loss": 4.1796, "loss/crossentropy": 2.1742812991142273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19085688889026642, "step": 21752 }, { "epoch": 0.43508, "grad_norm": 1.9296875, "grad_norm_var": 0.010985310872395833, "learning_rate": 0.0001, "loss": 4.1924, "loss/crossentropy": 2.44822895526886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128421813249588, "step": 21754 }, { "epoch": 0.43512, "grad_norm": 1.96875, "grad_norm_var": 0.01138916015625, "learning_rate": 0.0001, "loss": 4.1137, "loss/crossentropy": 1.8242409229278564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19045840948820114, "step": 21756 }, { "epoch": 0.43516, "grad_norm": 2.0, "grad_norm_var": 0.012474568684895833, "learning_rate": 0.0001, "loss": 4.0617, "loss/crossentropy": 2.1707061529159546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152707874774933, "step": 21758 }, { "epoch": 0.4352, "grad_norm": 1.8671875, "grad_norm_var": 0.010672760009765626, "learning_rate": 0.0001, "loss": 4.045, "loss/crossentropy": 2.0790776014328003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19175638258457184, "step": 21760 }, { "epoch": 0.43524, "grad_norm": 1.828125, "grad_norm_var": 0.0095947265625, "learning_rate": 0.0001, "loss": 3.8814, "loss/crossentropy": 1.9847996830940247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18776872754096985, "step": 21762 }, { "epoch": 0.43528, "grad_norm": 2.125, "grad_norm_var": 0.0096435546875, "learning_rate": 0.0001, "loss": 4.0754, "loss/crossentropy": 2.122888207435608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20312964171171188, "step": 21764 }, { "epoch": 0.43532, "grad_norm": 1.8984375, "grad_norm_var": 0.0095855712890625, "learning_rate": 0.0001, "loss": 4.035, "loss/crossentropy": 2.1922764778137207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2057497799396515, "step": 21766 }, { "epoch": 0.43536, "grad_norm": 1.7421875, "grad_norm_var": 0.013321685791015624, "learning_rate": 0.0001, "loss": 3.8096, "loss/crossentropy": 1.392991840839386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1425289735198021, "step": 21768 }, { "epoch": 0.4354, "grad_norm": 2.03125, "grad_norm_var": 0.012115224202473959, "learning_rate": 0.0001, "loss": 4.1758, "loss/crossentropy": 1.9627132415771484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2007637917995453, "step": 21770 }, { "epoch": 0.43544, "grad_norm": 1.96875, "grad_norm_var": 0.011374664306640626, "learning_rate": 0.0001, "loss": 4.0094, "loss/crossentropy": 2.04905503988266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18171894550323486, "step": 21772 }, { "epoch": 0.43548, "grad_norm": 1.828125, "grad_norm_var": 0.0086090087890625, "learning_rate": 0.0001, "loss": 3.7465, "loss/crossentropy": 2.09994637966156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1924375221133232, "step": 21774 }, { "epoch": 0.43552, "grad_norm": 1.875, "grad_norm_var": 0.008565012613932292, "learning_rate": 0.0001, "loss": 4.2903, "loss/crossentropy": 2.381268620491028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22655388712882996, "step": 21776 }, { "epoch": 0.43556, "grad_norm": 1.9765625, "grad_norm_var": 0.008161417643229167, "learning_rate": 0.0001, "loss": 4.151, "loss/crossentropy": 2.189077138900757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19720469415187836, "step": 21778 }, { "epoch": 0.4356, "grad_norm": 1.9140625, "grad_norm_var": 0.005381011962890625, "learning_rate": 0.0001, "loss": 3.7407, "loss/crossentropy": 1.691848337650299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1633242815732956, "step": 21780 }, { "epoch": 0.43564, "grad_norm": 1.9296875, "grad_norm_var": 0.007054646809895833, "learning_rate": 0.0001, "loss": 4.0392, "loss/crossentropy": 2.170955538749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20698139071464539, "step": 21782 }, { "epoch": 0.43568, "grad_norm": 1.9921875, "grad_norm_var": 0.00499267578125, "learning_rate": 0.0001, "loss": 3.9417, "loss/crossentropy": 2.0300655364990234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1849035769701004, "step": 21784 }, { "epoch": 0.43572, "grad_norm": 1.984375, "grad_norm_var": 0.004443359375, "learning_rate": 0.0001, "loss": 3.9352, "loss/crossentropy": 1.998863697052002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1871778443455696, "step": 21786 }, { "epoch": 0.43576, "grad_norm": 1.8984375, "grad_norm_var": 0.004257965087890625, "learning_rate": 0.0001, "loss": 3.9628, "loss/crossentropy": 1.7537881731987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.178309828042984, "step": 21788 }, { "epoch": 0.4358, "grad_norm": 1.8828125, "grad_norm_var": 0.003916168212890625, "learning_rate": 0.0001, "loss": 3.932, "loss/crossentropy": 1.9158823490142822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18895266950130463, "step": 21790 }, { "epoch": 0.43584, "grad_norm": 2.0, "grad_norm_var": 0.0035845438639322915, "learning_rate": 0.0001, "loss": 4.0875, "loss/crossentropy": 2.177749514579773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20137280225753784, "step": 21792 }, { "epoch": 0.43588, "grad_norm": 1.953125, "grad_norm_var": 0.004788970947265625, "learning_rate": 0.0001, "loss": 4.1789, "loss/crossentropy": 2.132981538772583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20100867003202438, "step": 21794 }, { "epoch": 0.43592, "grad_norm": 1.9296875, "grad_norm_var": 0.004564412434895833, "learning_rate": 0.0001, "loss": 3.9301, "loss/crossentropy": 1.926209807395935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19080301374197006, "step": 21796 }, { "epoch": 0.43596, "grad_norm": 1.8828125, "grad_norm_var": 0.0040891011555989586, "learning_rate": 0.0001, "loss": 4.1137, "loss/crossentropy": 2.1029208302497864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002335861325264, "step": 21798 }, { "epoch": 0.436, "grad_norm": 1.9375, "grad_norm_var": 0.0036333719889322918, "learning_rate": 0.0001, "loss": 4.1378, "loss/crossentropy": 2.0101813673973083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21301408112049103, "step": 21800 }, { "epoch": 0.43604, "grad_norm": 1.796875, "grad_norm_var": 0.005250803629557292, "learning_rate": 0.0001, "loss": 3.8085, "loss/crossentropy": 2.2096749544143677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20425067842006683, "step": 21802 }, { "epoch": 0.43608, "grad_norm": 1.8828125, "grad_norm_var": 0.005236562093098958, "learning_rate": 0.0001, "loss": 3.9631, "loss/crossentropy": 1.7085874676704407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17989665269851685, "step": 21804 }, { "epoch": 0.43612, "grad_norm": 1.7890625, "grad_norm_var": 0.006086222330729167, "learning_rate": 0.0001, "loss": 4.2035, "loss/crossentropy": 2.082730233669281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19857076555490494, "step": 21806 }, { "epoch": 0.43616, "grad_norm": 1.96875, "grad_norm_var": 0.0062164306640625, "learning_rate": 0.0001, "loss": 4.0634, "loss/crossentropy": 2.0145097970962524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956789717078209, "step": 21808 }, { "epoch": 0.4362, "grad_norm": 1.953125, "grad_norm_var": 0.0051666259765625, "learning_rate": 0.0001, "loss": 4.31, "loss/crossentropy": 2.0774065256118774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20803070068359375, "step": 21810 }, { "epoch": 0.43624, "grad_norm": 1.9765625, "grad_norm_var": 0.004839833577473958, "learning_rate": 0.0001, "loss": 3.9442, "loss/crossentropy": 2.045994222164154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19481085240840912, "step": 21812 }, { "epoch": 0.43628, "grad_norm": 2.0, "grad_norm_var": 0.005570220947265625, "learning_rate": 0.0001, "loss": 4.119, "loss/crossentropy": 1.9371901154518127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20281973481178284, "step": 21814 }, { "epoch": 0.43632, "grad_norm": 2.046875, "grad_norm_var": 0.00648193359375, "learning_rate": 0.0001, "loss": 3.8379, "loss/crossentropy": 2.1048339009284973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919357031583786, "step": 21816 }, { "epoch": 0.43636, "grad_norm": 1.921875, "grad_norm_var": 0.005116526285807292, "learning_rate": 0.0001, "loss": 4.0696, "loss/crossentropy": 2.162278175354004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19293204694986343, "step": 21818 }, { "epoch": 0.4364, "grad_norm": 1.859375, "grad_norm_var": 0.006034342447916666, "learning_rate": 0.0001, "loss": 4.0294, "loss/crossentropy": 1.9012067317962646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19044938683509827, "step": 21820 }, { "epoch": 0.43644, "grad_norm": 1.8671875, "grad_norm_var": 0.004583485921223958, "learning_rate": 0.0001, "loss": 3.8538, "loss/crossentropy": 1.617782175540924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1641436219215393, "step": 21822 }, { "epoch": 0.43648, "grad_norm": 2.109375, "grad_norm_var": 0.006048329671223958, "learning_rate": 0.0001, "loss": 4.2229, "loss/crossentropy": 2.136145055294037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20208875089883804, "step": 21824 }, { "epoch": 0.43652, "grad_norm": 2.0625, "grad_norm_var": 0.006475575764973958, "learning_rate": 0.0001, "loss": 4.0446, "loss/crossentropy": 2.1240354776382446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052341103553772, "step": 21826 }, { "epoch": 0.43656, "grad_norm": 1.8671875, "grad_norm_var": 0.007020823160807292, "learning_rate": 0.0001, "loss": 4.0295, "loss/crossentropy": 1.8771272897720337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17846016585826874, "step": 21828 }, { "epoch": 0.4366, "grad_norm": 1.9921875, "grad_norm_var": 0.005985260009765625, "learning_rate": 0.0001, "loss": 3.9318, "loss/crossentropy": 1.8061035871505737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1792590245604515, "step": 21830 }, { "epoch": 0.43664, "grad_norm": 1.953125, "grad_norm_var": 0.0055084228515625, "learning_rate": 0.0001, "loss": 4.0204, "loss/crossentropy": 2.0216365456581116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18977319449186325, "step": 21832 }, { "epoch": 0.43668, "grad_norm": 1.75, "grad_norm_var": 0.008056386311848959, "learning_rate": 0.0001, "loss": 3.7667, "loss/crossentropy": 1.81255304813385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1740209013223648, "step": 21834 }, { "epoch": 0.43672, "grad_norm": 1.7890625, "grad_norm_var": 0.0087310791015625, "learning_rate": 0.0001, "loss": 4.0626, "loss/crossentropy": 2.2982383966445923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20780136436223984, "step": 21836 }, { "epoch": 0.43676, "grad_norm": 2.078125, "grad_norm_var": 0.010037994384765625, "learning_rate": 0.0001, "loss": 3.9891, "loss/crossentropy": 2.0461641550064087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21413680911064148, "step": 21838 }, { "epoch": 0.4368, "grad_norm": 1.921875, "grad_norm_var": 0.008341217041015625, "learning_rate": 0.0001, "loss": 4.1413, "loss/crossentropy": 2.1272863149642944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22310787439346313, "step": 21840 }, { "epoch": 0.43684, "grad_norm": 1.8359375, "grad_norm_var": 0.008133951822916667, "learning_rate": 0.0001, "loss": 3.9467, "loss/crossentropy": 1.9026488661766052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18013561517000198, "step": 21842 }, { "epoch": 0.43688, "grad_norm": 1.921875, "grad_norm_var": 0.007635243733723958, "learning_rate": 0.0001, "loss": 4.2739, "loss/crossentropy": 2.1403396129608154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20329253375530243, "step": 21844 }, { "epoch": 0.43692, "grad_norm": 1.9375, "grad_norm_var": 0.0075103759765625, "learning_rate": 0.0001, "loss": 4.1922, "loss/crossentropy": 2.041237533092499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20238220691680908, "step": 21846 }, { "epoch": 0.43696, "grad_norm": 1.8515625, "grad_norm_var": 0.008011881510416667, "learning_rate": 0.0001, "loss": 4.2067, "loss/crossentropy": 1.860413134098053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19062548130750656, "step": 21848 }, { "epoch": 0.437, "grad_norm": 1.9609375, "grad_norm_var": 0.007721964518229167, "learning_rate": 0.0001, "loss": 3.962, "loss/crossentropy": 2.1432868242263794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947617009282112, "step": 21850 }, { "epoch": 0.43704, "grad_norm": 1.8515625, "grad_norm_var": 0.007834625244140626, "learning_rate": 0.0001, "loss": 3.667, "loss/crossentropy": 1.5292840003967285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1465652659535408, "step": 21852 }, { "epoch": 0.43708, "grad_norm": 1.90625, "grad_norm_var": 0.00523681640625, "learning_rate": 0.0001, "loss": 3.7604, "loss/crossentropy": 1.8826366066932678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19239384680986404, "step": 21854 }, { "epoch": 0.43712, "grad_norm": 1.890625, "grad_norm_var": 0.005086008707682292, "learning_rate": 0.0001, "loss": 4.0817, "loss/crossentropy": 1.7915772199630737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18841900676488876, "step": 21856 }, { "epoch": 0.43716, "grad_norm": 1.9296875, "grad_norm_var": 0.0048868815104166664, "learning_rate": 0.0001, "loss": 3.7986, "loss/crossentropy": 1.9425716400146484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19191939383745193, "step": 21858 }, { "epoch": 0.4372, "grad_norm": 1.84375, "grad_norm_var": 0.006131744384765625, "learning_rate": 0.0001, "loss": 3.8735, "loss/crossentropy": 2.073697865009308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19094493240118027, "step": 21860 }, { "epoch": 0.43724, "grad_norm": 1.9453125, "grad_norm_var": 0.046575673421223956, "learning_rate": 0.0001, "loss": 4.0953, "loss/crossentropy": 2.023917257785797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218250036239624, "step": 21862 }, { "epoch": 0.43728, "grad_norm": 1.96875, "grad_norm_var": 0.04965413411458333, "learning_rate": 0.0001, "loss": 4.1522, "loss/crossentropy": 1.9710112810134888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21274644136428833, "step": 21864 }, { "epoch": 0.43732, "grad_norm": 1.953125, "grad_norm_var": 0.04736328125, "learning_rate": 0.0001, "loss": 4.0579, "loss/crossentropy": 1.9758725762367249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1855839416384697, "step": 21866 }, { "epoch": 0.43736, "grad_norm": 2.15625, "grad_norm_var": 0.04616597493489583, "learning_rate": 0.0001, "loss": 4.0144, "loss/crossentropy": 2.0672810077667236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20418523997068405, "step": 21868 }, { "epoch": 0.4374, "grad_norm": 1.921875, "grad_norm_var": 0.046000162760416664, "learning_rate": 0.0001, "loss": 3.6892, "loss/crossentropy": 1.8937869668006897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18653447926044464, "step": 21870 }, { "epoch": 0.43744, "grad_norm": 1.8515625, "grad_norm_var": 0.049006144205729164, "learning_rate": 0.0001, "loss": 3.8814, "loss/crossentropy": 1.6613503694534302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16891123354434967, "step": 21872 }, { "epoch": 0.43748, "grad_norm": 2.0, "grad_norm_var": 0.04822489420572917, "learning_rate": 0.0001, "loss": 4.2159, "loss/crossentropy": 2.007514178752899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2078787311911583, "step": 21874 }, { "epoch": 0.43752, "grad_norm": 1.953125, "grad_norm_var": 0.043603515625, "learning_rate": 0.0001, "loss": 3.831, "loss/crossentropy": 1.913162350654602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20729011297225952, "step": 21876 }, { "epoch": 0.43756, "grad_norm": 2.03125, "grad_norm_var": 0.012849680582682292, "learning_rate": 0.0001, "loss": 3.9612, "loss/crossentropy": 2.1534116864204407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183125138282776, "step": 21878 }, { "epoch": 0.4376, "grad_norm": 1.9296875, "grad_norm_var": 0.012841542561848959, "learning_rate": 0.0001, "loss": 3.8101, "loss/crossentropy": 1.9248425960540771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19415219128131866, "step": 21880 }, { "epoch": 0.43764, "grad_norm": 1.921875, "grad_norm_var": 0.012790679931640625, "learning_rate": 0.0001, "loss": 3.7379, "loss/crossentropy": 2.095816135406494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181143820285797, "step": 21882 }, { "epoch": 0.43768, "grad_norm": 2.25, "grad_norm_var": 0.015405019124348959, "learning_rate": 0.0001, "loss": 4.6839, "loss/crossentropy": 2.0167580246925354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21327386051416397, "step": 21884 }, { "epoch": 0.43772, "grad_norm": 2.15625, "grad_norm_var": 0.019038899739583334, "learning_rate": 0.0001, "loss": 4.3355, "loss/crossentropy": 2.2425005435943604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20246200263500214, "step": 21886 }, { "epoch": 0.43776, "grad_norm": 2.0625, "grad_norm_var": 0.017232259114583332, "learning_rate": 0.0001, "loss": 4.3023, "loss/crossentropy": 1.9543325901031494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18824229389429092, "step": 21888 }, { "epoch": 0.4378, "grad_norm": 1.890625, "grad_norm_var": 0.017365519205729166, "learning_rate": 0.0001, "loss": 4.0802, "loss/crossentropy": 2.2031290531158447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20949723571538925, "step": 21890 }, { "epoch": 0.43784, "grad_norm": 1.8203125, "grad_norm_var": 0.01901219685872396, "learning_rate": 0.0001, "loss": 3.728, "loss/crossentropy": 1.97504061460495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19441132247447968, "step": 21892 }, { "epoch": 0.43788, "grad_norm": 1.78125, "grad_norm_var": 0.018888346354166665, "learning_rate": 0.0001, "loss": 3.8922, "loss/crossentropy": 1.8496057391166687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18915118277072906, "step": 21894 }, { "epoch": 0.43792, "grad_norm": 2.0, "grad_norm_var": 0.015116373697916666, "learning_rate": 0.0001, "loss": 4.0474, "loss/crossentropy": 1.9244802594184875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20679102092981339, "step": 21896 }, { "epoch": 0.43796, "grad_norm": 1.7890625, "grad_norm_var": 0.016902414957682292, "learning_rate": 0.0001, "loss": 3.9855, "loss/crossentropy": 2.1038661003112793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2014932855963707, "step": 21898 }, { "epoch": 0.438, "grad_norm": 1.84375, "grad_norm_var": 0.011286417643229166, "learning_rate": 0.0001, "loss": 4.0416, "loss/crossentropy": 2.202482581138611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20515280216932297, "step": 21900 }, { "epoch": 0.43804, "grad_norm": 1.84375, "grad_norm_var": 0.008353678385416667, "learning_rate": 0.0001, "loss": 4.0346, "loss/crossentropy": 2.1215643286705017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18430311977863312, "step": 21902 }, { "epoch": 0.43808, "grad_norm": 2.078125, "grad_norm_var": 0.0086822509765625, "learning_rate": 0.0001, "loss": 4.1243, "loss/crossentropy": 1.585806667804718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1615592986345291, "step": 21904 }, { "epoch": 0.43812, "grad_norm": 1.96875, "grad_norm_var": 0.008831532796223958, "learning_rate": 0.0001, "loss": 4.0288, "loss/crossentropy": 2.0006843209266663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21003154665231705, "step": 21906 }, { "epoch": 0.43816, "grad_norm": 1.8984375, "grad_norm_var": 0.008571116129557292, "learning_rate": 0.0001, "loss": 3.8523, "loss/crossentropy": 1.9077118635177612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881355568766594, "step": 21908 }, { "epoch": 0.4382, "grad_norm": 1.890625, "grad_norm_var": 0.0080810546875, "learning_rate": 0.0001, "loss": 3.7186, "loss/crossentropy": 1.752367615699768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17893216013908386, "step": 21910 }, { "epoch": 0.43824, "grad_norm": 1.9765625, "grad_norm_var": 0.0078765869140625, "learning_rate": 0.0001, "loss": 4.171, "loss/crossentropy": 2.1325889825820923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21129589527845383, "step": 21912 }, { "epoch": 0.43828, "grad_norm": 1.9609375, "grad_norm_var": 0.006949615478515625, "learning_rate": 0.0001, "loss": 4.3083, "loss/crossentropy": 2.630311131477356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224892295897007, "step": 21914 }, { "epoch": 0.43832, "grad_norm": 1.8828125, "grad_norm_var": 0.007549794514973959, "learning_rate": 0.0001, "loss": 4.1696, "loss/crossentropy": 2.002493679523468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19148597121238708, "step": 21916 }, { "epoch": 0.43836, "grad_norm": 2.0625, "grad_norm_var": 0.006571451822916667, "learning_rate": 0.0001, "loss": 4.1209, "loss/crossentropy": 1.968815267086029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18172769993543625, "step": 21918 }, { "epoch": 0.4384, "grad_norm": 1.9765625, "grad_norm_var": 0.005402628580729167, "learning_rate": 0.0001, "loss": 4.1595, "loss/crossentropy": 2.2519482374191284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22669588029384613, "step": 21920 }, { "epoch": 0.43844, "grad_norm": 1.9921875, "grad_norm_var": 0.0059315999348958336, "learning_rate": 0.0001, "loss": 3.9382, "loss/crossentropy": 1.9276898503303528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18415841460227966, "step": 21922 }, { "epoch": 0.43848, "grad_norm": 1.9921875, "grad_norm_var": 0.004941558837890625, "learning_rate": 0.0001, "loss": 4.1358, "loss/crossentropy": 2.1866860389709473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2306595891714096, "step": 21924 }, { "epoch": 0.43852, "grad_norm": 1.875, "grad_norm_var": 0.004378000895182292, "learning_rate": 0.0001, "loss": 3.9259, "loss/crossentropy": 1.8013821840286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17407341301441193, "step": 21926 }, { "epoch": 0.43856, "grad_norm": 1.8359375, "grad_norm_var": 0.005110422770182292, "learning_rate": 0.0001, "loss": 3.9644, "loss/crossentropy": 2.116545557975769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19539258629083633, "step": 21928 }, { "epoch": 0.4386, "grad_norm": 1.8359375, "grad_norm_var": 0.006009928385416667, "learning_rate": 0.0001, "loss": 3.8005, "loss/crossentropy": 2.039593815803528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1793600618839264, "step": 21930 }, { "epoch": 0.43864, "grad_norm": 1.796875, "grad_norm_var": 0.008385976155598959, "learning_rate": 0.0001, "loss": 3.9727, "loss/crossentropy": 2.0602275133132935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17620772868394852, "step": 21932 }, { "epoch": 0.43868, "grad_norm": 1.9140625, "grad_norm_var": 0.007024892171223958, "learning_rate": 0.0001, "loss": 3.7552, "loss/crossentropy": 1.9316160082817078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026260495185852, "step": 21934 }, { "epoch": 0.43872, "grad_norm": 1.984375, "grad_norm_var": 0.007004547119140625, "learning_rate": 0.0001, "loss": 4.0254, "loss/crossentropy": 2.152816414833069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18020687997341156, "step": 21936 }, { "epoch": 0.43876, "grad_norm": 1.8359375, "grad_norm_var": 0.0067626953125, "learning_rate": 0.0001, "loss": 3.8211, "loss/crossentropy": 1.8688368201255798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17652328312397003, "step": 21938 }, { "epoch": 0.4388, "grad_norm": 1.96875, "grad_norm_var": 0.006628163655598958, "learning_rate": 0.0001, "loss": 4.162, "loss/crossentropy": 1.8889214992523193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18263526260852814, "step": 21940 }, { "epoch": 0.43884, "grad_norm": 1.7578125, "grad_norm_var": 0.007869466145833334, "learning_rate": 0.0001, "loss": 3.8644, "loss/crossentropy": 1.9699294567108154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170155718922615, "step": 21942 }, { "epoch": 0.43888, "grad_norm": 1.9375, "grad_norm_var": 0.009642537434895833, "learning_rate": 0.0001, "loss": 4.1007, "loss/crossentropy": 1.7749728560447693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18438361585140228, "step": 21944 }, { "epoch": 0.43892, "grad_norm": 1.9765625, "grad_norm_var": 0.008910115559895833, "learning_rate": 0.0001, "loss": 4.0266, "loss/crossentropy": 2.1438393592834473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22740208357572556, "step": 21946 }, { "epoch": 0.43896, "grad_norm": 1.9375, "grad_norm_var": 0.005663045247395833, "learning_rate": 0.0001, "loss": 4.0983, "loss/crossentropy": 2.1517677307128906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20089010894298553, "step": 21948 }, { "epoch": 0.439, "grad_norm": 1.9765625, "grad_norm_var": 0.006931304931640625, "learning_rate": 0.0001, "loss": 4.0286, "loss/crossentropy": 2.283053159713745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19754114747047424, "step": 21950 }, { "epoch": 0.43904, "grad_norm": 1.9140625, "grad_norm_var": 0.008438873291015624, "learning_rate": 0.0001, "loss": 4.3633, "loss/crossentropy": 2.121833860874176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20339152216911316, "step": 21952 }, { "epoch": 0.43908, "grad_norm": 2.0, "grad_norm_var": 0.007926177978515626, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 2.206274390220642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19716881215572357, "step": 21954 }, { "epoch": 0.43912, "grad_norm": 1.8203125, "grad_norm_var": 0.012749989827473959, "learning_rate": 0.0001, "loss": 4.185, "loss/crossentropy": 2.1501349210739136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20559798926115036, "step": 21956 }, { "epoch": 0.43916, "grad_norm": 2.0625, "grad_norm_var": 0.012088775634765625, "learning_rate": 0.0001, "loss": 3.9056, "loss/crossentropy": 2.0854570269584656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22488616406917572, "step": 21958 }, { "epoch": 0.4392, "grad_norm": 3.265625, "grad_norm_var": 0.11881103515625, "learning_rate": 0.0001, "loss": 3.651, "loss/crossentropy": 1.840592384338379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934354454278946, "step": 21960 }, { "epoch": 0.43924, "grad_norm": 1.8203125, "grad_norm_var": 0.12266006469726562, "learning_rate": 0.0001, "loss": 4.013, "loss/crossentropy": 2.062163829803467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19524814933538437, "step": 21962 }, { "epoch": 0.43928, "grad_norm": 1.765625, "grad_norm_var": 0.1280413309733073, "learning_rate": 0.0001, "loss": 3.8329, "loss/crossentropy": 1.7060028910636902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17902052402496338, "step": 21964 }, { "epoch": 0.43932, "grad_norm": 1.8046875, "grad_norm_var": 0.12742919921875, "learning_rate": 0.0001, "loss": 4.0638, "loss/crossentropy": 2.246550440788269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088882103562355, "step": 21966 }, { "epoch": 0.43936, "grad_norm": 1.9375, "grad_norm_var": 0.12764053344726561, "learning_rate": 0.0001, "loss": 4.2095, "loss/crossentropy": 2.249360680580139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20600482821464539, "step": 21968 }, { "epoch": 0.4394, "grad_norm": 1.8125, "grad_norm_var": 0.13187840779622395, "learning_rate": 0.0001, "loss": 4.2291, "loss/crossentropy": 2.029528558254242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18438928574323654, "step": 21970 }, { "epoch": 0.43944, "grad_norm": 1.8828125, "grad_norm_var": 0.1287249247233073, "learning_rate": 0.0001, "loss": 4.0865, "loss/crossentropy": 2.0363243222236633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19402872025966644, "step": 21972 }, { "epoch": 0.43948, "grad_norm": 2.015625, "grad_norm_var": 0.12544657389322916, "learning_rate": 0.0001, "loss": 4.2188, "loss/crossentropy": 2.1816266775131226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20053647458553314, "step": 21974 }, { "epoch": 0.43952, "grad_norm": 2.015625, "grad_norm_var": 0.01646703084309896, "learning_rate": 0.0001, "loss": 4.2621, "loss/crossentropy": 2.452013850212097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117229700088501, "step": 21976 }, { "epoch": 0.43956, "grad_norm": 1.96875, "grad_norm_var": 0.0123779296875, "learning_rate": 0.0001, "loss": 4.0617, "loss/crossentropy": 2.1460453271865845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2046569585800171, "step": 21978 }, { "epoch": 0.4396, "grad_norm": 1.953125, "grad_norm_var": 0.009639485677083334, "learning_rate": 0.0001, "loss": 4.1805, "loss/crossentropy": 2.152291774749756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19845052808523178, "step": 21980 }, { "epoch": 0.43964, "grad_norm": 2.03125, "grad_norm_var": 0.007950592041015624, "learning_rate": 0.0001, "loss": 4.3282, "loss/crossentropy": 2.280704140663147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20648686587810516, "step": 21982 }, { "epoch": 0.43968, "grad_norm": 1.8359375, "grad_norm_var": 0.009004465738932292, "learning_rate": 0.0001, "loss": 4.146, "loss/crossentropy": 2.2323286533355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1949191614985466, "step": 21984 }, { "epoch": 0.43972, "grad_norm": 2.046875, "grad_norm_var": 0.006135813395182292, "learning_rate": 0.0001, "loss": 3.8289, "loss/crossentropy": 2.105699062347412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19488361477851868, "step": 21986 }, { "epoch": 0.43976, "grad_norm": 1.6875, "grad_norm_var": 0.009834543863932291, "learning_rate": 0.0001, "loss": 3.731, "loss/crossentropy": 1.7268863916397095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16756988316774368, "step": 21988 }, { "epoch": 0.4398, "grad_norm": 1.953125, "grad_norm_var": 0.009458160400390625, "learning_rate": 0.0001, "loss": 4.0178, "loss/crossentropy": 2.1496203541755676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20237596333026886, "step": 21990 }, { "epoch": 0.43984, "grad_norm": 1.984375, "grad_norm_var": 0.009350331624348958, "learning_rate": 0.0001, "loss": 4.1835, "loss/crossentropy": 2.196579158306122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20089206099510193, "step": 21992 }, { "epoch": 0.43988, "grad_norm": 2.140625, "grad_norm_var": 0.011958821614583334, "learning_rate": 0.0001, "loss": 4.0065, "loss/crossentropy": 2.1145724654197693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21221736818552017, "step": 21994 }, { "epoch": 0.43992, "grad_norm": 1.9921875, "grad_norm_var": 0.012230428059895833, "learning_rate": 0.0001, "loss": 4.1096, "loss/crossentropy": 1.8034029603004456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18355921655893326, "step": 21996 }, { "epoch": 0.43996, "grad_norm": 2.015625, "grad_norm_var": 0.012262980143229166, "learning_rate": 0.0001, "loss": 4.1805, "loss/crossentropy": 1.8520742654800415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18945766985416412, "step": 21998 }, { "epoch": 0.44, "grad_norm": 2.0625, "grad_norm_var": 0.012962849934895833, "learning_rate": 0.0001, "loss": 4.4938, "loss/crossentropy": 2.0837312936782837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22002845257520676, "step": 22000 }, { "epoch": 0.44004, "grad_norm": 1.859375, "grad_norm_var": 0.011470540364583334, "learning_rate": 0.0001, "loss": 4.0409, "loss/crossentropy": 1.9202563166618347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19138525426387787, "step": 22002 }, { "epoch": 0.44008, "grad_norm": 1.8984375, "grad_norm_var": 0.006506093343098958, "learning_rate": 0.0001, "loss": 3.9965, "loss/crossentropy": 2.0986216068267822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18914636224508286, "step": 22004 }, { "epoch": 0.44012, "grad_norm": 1.953125, "grad_norm_var": 0.008955637613932291, "learning_rate": 0.0001, "loss": 3.5102, "loss/crossentropy": 1.675586223602295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17793113738298416, "step": 22006 }, { "epoch": 0.44016, "grad_norm": 2.0625, "grad_norm_var": 0.0088043212890625, "learning_rate": 0.0001, "loss": 4.0252, "loss/crossentropy": 1.7633379697799683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17190124094486237, "step": 22008 }, { "epoch": 0.4402, "grad_norm": 1.8203125, "grad_norm_var": 0.008796183268229167, "learning_rate": 0.0001, "loss": 3.925, "loss/crossentropy": 2.0000420212745667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18909277766942978, "step": 22010 }, { "epoch": 0.44024, "grad_norm": 1.8203125, "grad_norm_var": 0.009691365559895833, "learning_rate": 0.0001, "loss": 3.8831, "loss/crossentropy": 2.2526057958602905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19274260103702545, "step": 22012 }, { "epoch": 0.44028, "grad_norm": 1.859375, "grad_norm_var": 0.010261027018229167, "learning_rate": 0.0001, "loss": 3.9836, "loss/crossentropy": 2.1151334643363953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992139369249344, "step": 22014 }, { "epoch": 0.44032, "grad_norm": 1.84375, "grad_norm_var": 0.008546702067057292, "learning_rate": 0.0001, "loss": 3.932, "loss/crossentropy": 2.188230037689209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.205021433532238, "step": 22016 }, { "epoch": 0.44036, "grad_norm": 1.984375, "grad_norm_var": 0.008329264322916667, "learning_rate": 0.0001, "loss": 3.9484, "loss/crossentropy": 2.015773594379425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19697444140911102, "step": 22018 }, { "epoch": 0.4404, "grad_norm": 1.7890625, "grad_norm_var": 0.008882649739583333, "learning_rate": 0.0001, "loss": 3.7165, "loss/crossentropy": 1.724815011024475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17810098081827164, "step": 22020 }, { "epoch": 0.44044, "grad_norm": 2.09375, "grad_norm_var": 0.010091145833333334, "learning_rate": 0.0001, "loss": 4.3719, "loss/crossentropy": 2.471455454826355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20882685482501984, "step": 22022 }, { "epoch": 0.44048, "grad_norm": 1.9765625, "grad_norm_var": 0.009212239583333334, "learning_rate": 0.0001, "loss": 4.2544, "loss/crossentropy": 2.2879083156585693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20954442769289017, "step": 22024 }, { "epoch": 0.44052, "grad_norm": 1.9453125, "grad_norm_var": 0.008440907796223958, "learning_rate": 0.0001, "loss": 4.1305, "loss/crossentropy": 2.352377772331238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21374861150979996, "step": 22026 }, { "epoch": 0.44056, "grad_norm": 1.875, "grad_norm_var": 0.010823313395182292, "learning_rate": 0.0001, "loss": 3.889, "loss/crossentropy": 1.5395016074180603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17564409226179123, "step": 22028 }, { "epoch": 0.4406, "grad_norm": 1.890625, "grad_norm_var": 0.009440104166666666, "learning_rate": 0.0001, "loss": 3.997, "loss/crossentropy": 2.079974055290222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886643022298813, "step": 22030 }, { "epoch": 0.44064, "grad_norm": 2.21875, "grad_norm_var": 0.013110097249348958, "learning_rate": 0.0001, "loss": 4.3671, "loss/crossentropy": 2.0840702056884766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21841587871313095, "step": 22032 }, { "epoch": 0.44068, "grad_norm": 2.125, "grad_norm_var": 0.014168294270833333, "learning_rate": 0.0001, "loss": 4.2372, "loss/crossentropy": 2.574951171875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21359464526176453, "step": 22034 }, { "epoch": 0.44072, "grad_norm": 2.234375, "grad_norm_var": 0.011519368489583333, "learning_rate": 0.0001, "loss": 4.0261, "loss/crossentropy": 1.8530864715576172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19658754020929337, "step": 22036 }, { "epoch": 0.44076, "grad_norm": 1.7109375, "grad_norm_var": 0.018289947509765626, "learning_rate": 0.0001, "loss": 3.7077, "loss/crossentropy": 1.9564177989959717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19047683477401733, "step": 22038 }, { "epoch": 0.4408, "grad_norm": 2.078125, "grad_norm_var": 0.02148615519205729, "learning_rate": 0.0001, "loss": 4.2303, "loss/crossentropy": 2.2482646703720093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21484995633363724, "step": 22040 }, { "epoch": 0.44084, "grad_norm": 2.640625, "grad_norm_var": 0.04890925089518229, "learning_rate": 0.0001, "loss": 3.9753, "loss/crossentropy": 1.9064926505088806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042565643787384, "step": 22042 }, { "epoch": 0.44088, "grad_norm": 1.921875, "grad_norm_var": 0.048164621988932295, "learning_rate": 0.0001, "loss": 3.8492, "loss/crossentropy": 1.9772136211395264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18002107739448547, "step": 22044 }, { "epoch": 0.44092, "grad_norm": 1.9609375, "grad_norm_var": 0.047240193684895834, "learning_rate": 0.0001, "loss": 4.0417, "loss/crossentropy": 1.8454242944717407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19727011024951935, "step": 22046 }, { "epoch": 0.44096, "grad_norm": 1.8984375, "grad_norm_var": 0.047584788004557295, "learning_rate": 0.0001, "loss": 3.8247, "loss/crossentropy": 1.815159022808075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17201630771160126, "step": 22048 }, { "epoch": 0.441, "grad_norm": 2.1875, "grad_norm_var": 0.05176976521809896, "learning_rate": 0.0001, "loss": 4.0909, "loss/crossentropy": 2.298068404197693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21187184751033783, "step": 22050 }, { "epoch": 0.44104, "grad_norm": 2.046875, "grad_norm_var": 0.04794108072916667, "learning_rate": 0.0001, "loss": 3.9931, "loss/crossentropy": 2.4030654430389404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233939692378044, "step": 22052 }, { "epoch": 0.44108, "grad_norm": 1.859375, "grad_norm_var": 0.043981679280598956, "learning_rate": 0.0001, "loss": 3.924, "loss/crossentropy": 2.1413121223449707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19442930072546005, "step": 22054 }, { "epoch": 0.44112, "grad_norm": 1.9375, "grad_norm_var": 0.0421875, "learning_rate": 0.0001, "loss": 4.0048, "loss/crossentropy": 2.029422342777252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19413675367832184, "step": 22056 }, { "epoch": 0.44116, "grad_norm": 2.0625, "grad_norm_var": 0.012010701497395833, "learning_rate": 0.0001, "loss": 4.3637, "loss/crossentropy": 2.3010586500167847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2198517844080925, "step": 22058 }, { "epoch": 0.4412, "grad_norm": 1.9296875, "grad_norm_var": 0.012353261311848959, "learning_rate": 0.0001, "loss": 4.0769, "loss/crossentropy": 2.0910425782203674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19917423278093338, "step": 22060 }, { "epoch": 0.44124, "grad_norm": 2.4375, "grad_norm_var": 0.027766672770182292, "learning_rate": 0.0001, "loss": 3.7656, "loss/crossentropy": 1.948616862297058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20419400930404663, "step": 22062 }, { "epoch": 0.44128, "grad_norm": 1.8515625, "grad_norm_var": 0.02814509073893229, "learning_rate": 0.0001, "loss": 4.1349, "loss/crossentropy": 2.0931553840637207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19765160977840424, "step": 22064 }, { "epoch": 0.44132, "grad_norm": 1.8671875, "grad_norm_var": 0.0232421875, "learning_rate": 0.0001, "loss": 4.1685, "loss/crossentropy": 2.0905882120132446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20447614789009094, "step": 22066 }, { "epoch": 0.44136, "grad_norm": 1.90625, "grad_norm_var": 0.022861480712890625, "learning_rate": 0.0001, "loss": 4.1407, "loss/crossentropy": 2.365593433380127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21229872107505798, "step": 22068 }, { "epoch": 0.4414, "grad_norm": 1.9453125, "grad_norm_var": 0.022093709309895834, "learning_rate": 0.0001, "loss": 4.2306, "loss/crossentropy": 2.286802887916565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844397693872452, "step": 22070 }, { "epoch": 0.44144, "grad_norm": 1.984375, "grad_norm_var": 0.021507771809895833, "learning_rate": 0.0001, "loss": 4.2739, "loss/crossentropy": 2.1805503368377686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20301498472690582, "step": 22072 }, { "epoch": 0.44148, "grad_norm": 2.03125, "grad_norm_var": 0.020702870686848958, "learning_rate": 0.0001, "loss": 4.0185, "loss/crossentropy": 2.025053381919861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959461346268654, "step": 22074 }, { "epoch": 0.44152, "grad_norm": 1.828125, "grad_norm_var": 0.02149658203125, "learning_rate": 0.0001, "loss": 4.1134, "loss/crossentropy": 2.1158708930015564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19305311143398285, "step": 22076 }, { "epoch": 0.44156, "grad_norm": 1.8984375, "grad_norm_var": 0.0063059488932291664, "learning_rate": 0.0001, "loss": 3.8749, "loss/crossentropy": 1.746639907360077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18400797247886658, "step": 22078 }, { "epoch": 0.4416, "grad_norm": 1.9375, "grad_norm_var": 0.0041168212890625, "learning_rate": 0.0001, "loss": 4.3071, "loss/crossentropy": 2.3301517963409424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19662386924028397, "step": 22080 }, { "epoch": 0.44164, "grad_norm": 1.90625, "grad_norm_var": 0.0036272684733072917, "learning_rate": 0.0001, "loss": 4.1136, "loss/crossentropy": 1.9772114753723145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18933425843715668, "step": 22082 }, { "epoch": 0.44168, "grad_norm": 1.9296875, "grad_norm_var": 0.0036069234212239582, "learning_rate": 0.0001, "loss": 4.0125, "loss/crossentropy": 1.9511193633079529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19440795481204987, "step": 22084 }, { "epoch": 0.44172, "grad_norm": 1.7890625, "grad_norm_var": 0.005785115559895833, "learning_rate": 0.0001, "loss": 3.6898, "loss/crossentropy": 1.7232664823532104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16435310244560242, "step": 22086 }, { "epoch": 0.44176, "grad_norm": 1.9453125, "grad_norm_var": 0.005549875895182291, "learning_rate": 0.0001, "loss": 4.0631, "loss/crossentropy": 2.1858248710632324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20494847744703293, "step": 22088 }, { "epoch": 0.4418, "grad_norm": 1.921875, "grad_norm_var": 0.01435546875, "learning_rate": 0.0001, "loss": 3.9496, "loss/crossentropy": 2.4218143224716187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21809446811676025, "step": 22090 }, { "epoch": 0.44184, "grad_norm": 1.8671875, "grad_norm_var": 0.014168039957682291, "learning_rate": 0.0001, "loss": 4.0209, "loss/crossentropy": 2.133104920387268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930398792028427, "step": 22092 }, { "epoch": 0.44188, "grad_norm": 1.9609375, "grad_norm_var": 0.013348134358723958, "learning_rate": 0.0001, "loss": 4.0959, "loss/crossentropy": 2.113961935043335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2095598503947258, "step": 22094 }, { "epoch": 0.44192, "grad_norm": 1.8671875, "grad_norm_var": 0.0151763916015625, "learning_rate": 0.0001, "loss": 3.7976, "loss/crossentropy": 1.984375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18821635842323303, "step": 22096 }, { "epoch": 0.44196, "grad_norm": 1.953125, "grad_norm_var": 0.0153717041015625, "learning_rate": 0.0001, "loss": 3.8215, "loss/crossentropy": 1.8913013339042664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1868671551346779, "step": 22098 }, { "epoch": 0.442, "grad_norm": 1.84375, "grad_norm_var": 0.01610081990559896, "learning_rate": 0.0001, "loss": 3.8635, "loss/crossentropy": 1.9196743369102478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18776098638772964, "step": 22100 }, { "epoch": 0.44204, "grad_norm": 1.921875, "grad_norm_var": 0.013134511311848958, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 1.929152011871338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17069555073976517, "step": 22102 }, { "epoch": 0.44208, "grad_norm": 1.921875, "grad_norm_var": 0.0133941650390625, "learning_rate": 0.0001, "loss": 4.0115, "loss/crossentropy": 2.0993363857269287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19678544998168945, "step": 22104 }, { "epoch": 0.44212, "grad_norm": 1.9140625, "grad_norm_var": 0.004559071858723959, "learning_rate": 0.0001, "loss": 3.9802, "loss/crossentropy": 1.8447982668876648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18432488292455673, "step": 22106 }, { "epoch": 0.44216, "grad_norm": 1.96875, "grad_norm_var": 0.0041544596354166664, "learning_rate": 0.0001, "loss": 4.2359, "loss/crossentropy": 2.04882550239563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2129831686615944, "step": 22108 }, { "epoch": 0.4422, "grad_norm": 1.8515625, "grad_norm_var": 0.0047686258951822914, "learning_rate": 0.0001, "loss": 3.8738, "loss/crossentropy": 1.7422301769256592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18266545236110687, "step": 22110 }, { "epoch": 0.44224, "grad_norm": 1.96875, "grad_norm_var": 0.006664784749348959, "learning_rate": 0.0001, "loss": 4.2107, "loss/crossentropy": 2.1211363077163696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22843683511018753, "step": 22112 }, { "epoch": 0.44228, "grad_norm": 2.09375, "grad_norm_var": 0.008097076416015625, "learning_rate": 0.0001, "loss": 3.8527, "loss/crossentropy": 2.2478936910629272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21724677085876465, "step": 22114 }, { "epoch": 0.44232, "grad_norm": 1.84375, "grad_norm_var": 0.0153228759765625, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.3345694541931152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20749730616807938, "step": 22116 }, { "epoch": 0.44236, "grad_norm": 1.7265625, "grad_norm_var": 0.020076243082682292, "learning_rate": 0.0001, "loss": 3.947, "loss/crossentropy": 2.0583457946777344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18861139565706253, "step": 22118 }, { "epoch": 0.4424, "grad_norm": 2.015625, "grad_norm_var": 0.0246734619140625, "learning_rate": 0.0001, "loss": 3.7534, "loss/crossentropy": 2.06750625371933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18891629576683044, "step": 22120 }, { "epoch": 0.44244, "grad_norm": 2.109375, "grad_norm_var": 0.025272623697916666, "learning_rate": 0.0001, "loss": 4.2192, "loss/crossentropy": 1.8823494911193848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20006988942623138, "step": 22122 }, { "epoch": 0.44248, "grad_norm": 2.15625, "grad_norm_var": 0.026920572916666666, "learning_rate": 0.0001, "loss": 4.1285, "loss/crossentropy": 2.0374066829681396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20347502827644348, "step": 22124 }, { "epoch": 0.44252, "grad_norm": 1.9140625, "grad_norm_var": 0.026195271809895834, "learning_rate": 0.0001, "loss": 3.8961, "loss/crossentropy": 1.4977151155471802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15512564033269882, "step": 22126 }, { "epoch": 0.44256, "grad_norm": 1.984375, "grad_norm_var": 0.024346669514973957, "learning_rate": 0.0001, "loss": 3.9462, "loss/crossentropy": 1.8102646470069885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1884431689977646, "step": 22128 }, { "epoch": 0.4426, "grad_norm": 1.8671875, "grad_norm_var": 0.024079386393229166, "learning_rate": 0.0001, "loss": 4.2244, "loss/crossentropy": 2.1330573558807373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19625350832939148, "step": 22130 }, { "epoch": 0.44264, "grad_norm": 1.90625, "grad_norm_var": 0.0181396484375, "learning_rate": 0.0001, "loss": 3.7348, "loss/crossentropy": 2.1866488456726074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21527475118637085, "step": 22132 }, { "epoch": 0.44268, "grad_norm": 2.0, "grad_norm_var": 0.013985188802083333, "learning_rate": 0.0001, "loss": 4.0073, "loss/crossentropy": 2.0211732387542725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20512627810239792, "step": 22134 }, { "epoch": 0.44272, "grad_norm": 1.8359375, "grad_norm_var": 0.010277303059895833, "learning_rate": 0.0001, "loss": 3.9338, "loss/crossentropy": 1.798271358013153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1809278130531311, "step": 22136 }, { "epoch": 0.44276, "grad_norm": 1.9296875, "grad_norm_var": 0.008927408854166667, "learning_rate": 0.0001, "loss": 4.0153, "loss/crossentropy": 1.7850923538208008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16607706993818283, "step": 22138 }, { "epoch": 0.4428, "grad_norm": 1.8828125, "grad_norm_var": 0.012361399332682292, "learning_rate": 0.0001, "loss": 4.0488, "loss/crossentropy": 2.1693010330200195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18788384646177292, "step": 22140 }, { "epoch": 0.44284, "grad_norm": 1.8515625, "grad_norm_var": 0.013242340087890625, "learning_rate": 0.0001, "loss": 4.0582, "loss/crossentropy": 2.124464750289917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21157250553369522, "step": 22142 }, { "epoch": 0.44288, "grad_norm": 1.8984375, "grad_norm_var": 0.018416086832682293, "learning_rate": 0.0001, "loss": 4.2624, "loss/crossentropy": 2.1086031794548035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20727767795324326, "step": 22144 }, { "epoch": 0.44292, "grad_norm": 1.890625, "grad_norm_var": 0.018000284830729168, "learning_rate": 0.0001, "loss": 3.8286, "loss/crossentropy": 1.8891723155975342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20070044696331024, "step": 22146 }, { "epoch": 0.44296, "grad_norm": 1.9140625, "grad_norm_var": 0.015173085530598958, "learning_rate": 0.0001, "loss": 4.1181, "loss/crossentropy": 2.1405081748962402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19909673184156418, "step": 22148 }, { "epoch": 0.443, "grad_norm": 1.9609375, "grad_norm_var": 0.0155181884765625, "learning_rate": 0.0001, "loss": 4.0455, "loss/crossentropy": 2.2116858959198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970309540629387, "step": 22150 }, { "epoch": 0.44304, "grad_norm": 1.8828125, "grad_norm_var": 0.015183258056640624, "learning_rate": 0.0001, "loss": 3.8698, "loss/crossentropy": 1.9010364413261414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18096549808979034, "step": 22152 }, { "epoch": 0.44308, "grad_norm": 1.875, "grad_norm_var": 0.015607706705729167, "learning_rate": 0.0001, "loss": 4.0256, "loss/crossentropy": 2.030808746814728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20662673562765121, "step": 22154 }, { "epoch": 0.44312, "grad_norm": 2.265625, "grad_norm_var": 0.018138631184895834, "learning_rate": 0.0001, "loss": 4.2633, "loss/crossentropy": 1.9940236806869507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2860369235277176, "step": 22156 }, { "epoch": 0.44316, "grad_norm": 1.828125, "grad_norm_var": 0.01758397420247396, "learning_rate": 0.0001, "loss": 4.1577, "loss/crossentropy": 2.3639464378356934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20355366170406342, "step": 22158 }, { "epoch": 0.4432, "grad_norm": 1.8984375, "grad_norm_var": 0.013199615478515624, "learning_rate": 0.0001, "loss": 3.9197, "loss/crossentropy": 2.247707962989807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2214345633983612, "step": 22160 }, { "epoch": 0.44324, "grad_norm": 2.109375, "grad_norm_var": 0.014696248372395833, "learning_rate": 0.0001, "loss": 4.3906, "loss/crossentropy": 2.462310314178467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2597040832042694, "step": 22162 }, { "epoch": 0.44328, "grad_norm": 1.9140625, "grad_norm_var": 0.014890289306640625, "learning_rate": 0.0001, "loss": 4.0932, "loss/crossentropy": 1.9838054180145264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21189701557159424, "step": 22164 }, { "epoch": 0.44332, "grad_norm": 2.0625, "grad_norm_var": 0.015169270833333333, "learning_rate": 0.0001, "loss": 3.988, "loss/crossentropy": 2.086707830429077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23454946279525757, "step": 22166 }, { "epoch": 0.44336, "grad_norm": 1.8671875, "grad_norm_var": 0.014924875895182292, "learning_rate": 0.0001, "loss": 3.8447, "loss/crossentropy": 1.9489662051200867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19790088385343552, "step": 22168 }, { "epoch": 0.4434, "grad_norm": 1.8046875, "grad_norm_var": 0.016532389322916667, "learning_rate": 0.0001, "loss": 3.7006, "loss/crossentropy": 1.9342001676559448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17832304537296295, "step": 22170 }, { "epoch": 0.44344, "grad_norm": 2.171875, "grad_norm_var": 0.011139933268229167, "learning_rate": 0.0001, "loss": 4.353, "loss/crossentropy": 2.248290777206421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21428222954273224, "step": 22172 }, { "epoch": 0.44348, "grad_norm": 2.015625, "grad_norm_var": 0.009865061442057291, "learning_rate": 0.0001, "loss": 4.1017, "loss/crossentropy": 1.8474311232566833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1795397698879242, "step": 22174 }, { "epoch": 0.44352, "grad_norm": 2.03125, "grad_norm_var": 0.009187571207682292, "learning_rate": 0.0001, "loss": 4.1804, "loss/crossentropy": 1.9687572121620178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062787041068077, "step": 22176 }, { "epoch": 0.44356, "grad_norm": 1.7421875, "grad_norm_var": 0.011625162760416667, "learning_rate": 0.0001, "loss": 3.8598, "loss/crossentropy": 2.4766552448272705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2046542540192604, "step": 22178 }, { "epoch": 0.4436, "grad_norm": 1.9609375, "grad_norm_var": 0.011618804931640626, "learning_rate": 0.0001, "loss": 4.1097, "loss/crossentropy": 2.2163573503494263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21187163889408112, "step": 22180 }, { "epoch": 0.44364, "grad_norm": 2.0, "grad_norm_var": 0.010813140869140625, "learning_rate": 0.0001, "loss": 4.1702, "loss/crossentropy": 2.180974006652832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20932868123054504, "step": 22182 }, { "epoch": 0.44368, "grad_norm": 1.96875, "grad_norm_var": 0.014957682291666666, "learning_rate": 0.0001, "loss": 4.2237, "loss/crossentropy": 1.97287255525589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17987515777349472, "step": 22184 }, { "epoch": 0.44372, "grad_norm": 2.0, "grad_norm_var": 0.013679758707682291, "learning_rate": 0.0001, "loss": 4.0676, "loss/crossentropy": 1.9375402927398682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994931995868683, "step": 22186 }, { "epoch": 0.44376, "grad_norm": 1.859375, "grad_norm_var": 0.0117340087890625, "learning_rate": 0.0001, "loss": 3.9908, "loss/crossentropy": 2.152758836746216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19482193887233734, "step": 22188 }, { "epoch": 0.4438, "grad_norm": 1.953125, "grad_norm_var": 0.012450917561848959, "learning_rate": 0.0001, "loss": 4.223, "loss/crossentropy": 2.29486083984375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24136198312044144, "step": 22190 }, { "epoch": 0.44384, "grad_norm": 2.15625, "grad_norm_var": 0.015363566080729167, "learning_rate": 0.0001, "loss": 3.9714, "loss/crossentropy": 2.143070697784424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122703492641449, "step": 22192 }, { "epoch": 0.44388, "grad_norm": 2.046875, "grad_norm_var": 0.011229451497395833, "learning_rate": 0.0001, "loss": 3.8691, "loss/crossentropy": 2.0414949655532837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614571869373322, "step": 22194 }, { "epoch": 0.44392, "grad_norm": 2.296875, "grad_norm_var": 0.016951497395833334, "learning_rate": 0.0001, "loss": 4.244, "loss/crossentropy": 2.137809634208679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22674410045146942, "step": 22196 }, { "epoch": 0.44396, "grad_norm": 2.171875, "grad_norm_var": 0.01871312459309896, "learning_rate": 0.0001, "loss": 4.4088, "loss/crossentropy": 2.292213559150696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18514274060726166, "step": 22198 }, { "epoch": 0.444, "grad_norm": 2.015625, "grad_norm_var": 0.016839345296223957, "learning_rate": 0.0001, "loss": 4.0419, "loss/crossentropy": 2.3552767038345337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21818525344133377, "step": 22200 }, { "epoch": 0.44404, "grad_norm": 2.015625, "grad_norm_var": 0.01602961222330729, "learning_rate": 0.0001, "loss": 4.2912, "loss/crossentropy": 2.1532652378082275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19477012753486633, "step": 22202 }, { "epoch": 0.44408, "grad_norm": 1.921875, "grad_norm_var": 0.012938435872395833, "learning_rate": 0.0001, "loss": 3.9805, "loss/crossentropy": 1.921963095664978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16913262009620667, "step": 22204 }, { "epoch": 0.44412, "grad_norm": 1.828125, "grad_norm_var": 0.015063222249348958, "learning_rate": 0.0001, "loss": 3.8596, "loss/crossentropy": 1.9791225790977478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18458476662635803, "step": 22206 }, { "epoch": 0.44416, "grad_norm": 1.8125, "grad_norm_var": 0.014595540364583333, "learning_rate": 0.0001, "loss": 4.0334, "loss/crossentropy": 2.0733718276023865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146601676940918, "step": 22208 }, { "epoch": 0.4442, "grad_norm": 1.875, "grad_norm_var": 0.015830230712890626, "learning_rate": 0.0001, "loss": 3.7326, "loss/crossentropy": 1.5904502272605896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16351275146007538, "step": 22210 }, { "epoch": 0.44424, "grad_norm": 1.8984375, "grad_norm_var": 0.009563954671223958, "learning_rate": 0.0001, "loss": 4.0214, "loss/crossentropy": 2.0840989351272583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21245559304952621, "step": 22212 }, { "epoch": 0.44428, "grad_norm": 2.046875, "grad_norm_var": 0.008025868733723959, "learning_rate": 0.0001, "loss": 3.9995, "loss/crossentropy": 2.0513384342193604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19362515211105347, "step": 22214 }, { "epoch": 0.44432, "grad_norm": 1.96875, "grad_norm_var": 0.010284169514973959, "learning_rate": 0.0001, "loss": 4.0989, "loss/crossentropy": 2.0588608980178833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839647963643074, "step": 22216 }, { "epoch": 0.44436, "grad_norm": 1.8046875, "grad_norm_var": 0.011742146809895833, "learning_rate": 0.0001, "loss": 4.0668, "loss/crossentropy": 2.0614622831344604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19217892736196518, "step": 22218 }, { "epoch": 0.4444, "grad_norm": 1.828125, "grad_norm_var": 0.013061269124348959, "learning_rate": 0.0001, "loss": 4.0823, "loss/crossentropy": 1.980837643146515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963345631957054, "step": 22220 }, { "epoch": 0.44444, "grad_norm": 1.7890625, "grad_norm_var": 0.014562733968098958, "learning_rate": 0.0001, "loss": 3.8748, "loss/crossentropy": 1.9071148037910461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20523779094219208, "step": 22222 }, { "epoch": 0.44448, "grad_norm": 2.078125, "grad_norm_var": 0.014020792643229167, "learning_rate": 0.0001, "loss": 4.2035, "loss/crossentropy": 2.0757131576538086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1940036118030548, "step": 22224 }, { "epoch": 0.44452, "grad_norm": 1.8671875, "grad_norm_var": 0.015105946858723959, "learning_rate": 0.0001, "loss": 3.8287, "loss/crossentropy": 1.8984442353248596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18971017003059387, "step": 22226 }, { "epoch": 0.44456, "grad_norm": 1.9375, "grad_norm_var": 0.014281972249348959, "learning_rate": 0.0001, "loss": 4.1821, "loss/crossentropy": 2.2503433227539062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21632683277130127, "step": 22228 }, { "epoch": 0.4446, "grad_norm": 1.9453125, "grad_norm_var": 0.011771647135416667, "learning_rate": 0.0001, "loss": 4.2548, "loss/crossentropy": 2.264014482498169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061793953180313, "step": 22230 }, { "epoch": 0.44464, "grad_norm": 2.015625, "grad_norm_var": 0.009279123942057292, "learning_rate": 0.0001, "loss": 4.1289, "loss/crossentropy": 2.2728497982025146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20043949782848358, "step": 22232 }, { "epoch": 0.44468, "grad_norm": 2.125, "grad_norm_var": 0.0100250244140625, "learning_rate": 0.0001, "loss": 4.199, "loss/crossentropy": 2.074263334274292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2182644009590149, "step": 22234 }, { "epoch": 0.44472, "grad_norm": 1.9140625, "grad_norm_var": 0.008780670166015626, "learning_rate": 0.0001, "loss": 3.9973, "loss/crossentropy": 2.1185666918754578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1989990770816803, "step": 22236 }, { "epoch": 0.44476, "grad_norm": 1.8828125, "grad_norm_var": 0.007028961181640625, "learning_rate": 0.0001, "loss": 3.9992, "loss/crossentropy": 1.6533132791519165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17203578352928162, "step": 22238 }, { "epoch": 0.4448, "grad_norm": 2.078125, "grad_norm_var": 0.008231353759765626, "learning_rate": 0.0001, "loss": 3.948, "loss/crossentropy": 2.2107443809509277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20056415349245071, "step": 22240 }, { "epoch": 0.44484, "grad_norm": 1.8515625, "grad_norm_var": 0.007938385009765625, "learning_rate": 0.0001, "loss": 3.793, "loss/crossentropy": 1.9579968452453613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1899353489279747, "step": 22242 }, { "epoch": 0.44488, "grad_norm": 2.140625, "grad_norm_var": 0.010945638020833334, "learning_rate": 0.0001, "loss": 4.012, "loss/crossentropy": 2.074744164943695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2323092818260193, "step": 22244 }, { "epoch": 0.44492, "grad_norm": 1.8125, "grad_norm_var": 0.012308756510416666, "learning_rate": 0.0001, "loss": 3.8848, "loss/crossentropy": 2.2182846069335938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18586912751197815, "step": 22246 }, { "epoch": 0.44496, "grad_norm": 2.09375, "grad_norm_var": 0.01761042277018229, "learning_rate": 0.0001, "loss": 4.3021, "loss/crossentropy": 2.2342774868011475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20816949754953384, "step": 22248 }, { "epoch": 0.445, "grad_norm": 1.8515625, "grad_norm_var": 0.0157135009765625, "learning_rate": 0.0001, "loss": 3.8011, "loss/crossentropy": 2.103771924972534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20282650738954544, "step": 22250 }, { "epoch": 0.44504, "grad_norm": 2.0625, "grad_norm_var": 0.017350006103515624, "learning_rate": 0.0001, "loss": 4.1103, "loss/crossentropy": 2.010720193386078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18631578236818314, "step": 22252 }, { "epoch": 0.44508, "grad_norm": 1.9296875, "grad_norm_var": 0.0169097900390625, "learning_rate": 0.0001, "loss": 3.92, "loss/crossentropy": 1.8840174674987793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19280597567558289, "step": 22254 }, { "epoch": 0.44512, "grad_norm": 2.015625, "grad_norm_var": 0.0151031494140625, "learning_rate": 0.0001, "loss": 4.0503, "loss/crossentropy": 2.404397130012512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23199105262756348, "step": 22256 }, { "epoch": 0.44516, "grad_norm": 1.7578125, "grad_norm_var": 0.018021392822265624, "learning_rate": 0.0001, "loss": 3.7078, "loss/crossentropy": 2.11636883020401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20003244280815125, "step": 22258 }, { "epoch": 0.4452, "grad_norm": 1.8046875, "grad_norm_var": 0.018195597330729167, "learning_rate": 0.0001, "loss": 4.1396, "loss/crossentropy": 1.8522619009017944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16992035508155823, "step": 22260 }, { "epoch": 0.44524, "grad_norm": 1.859375, "grad_norm_var": 0.01738459269205729, "learning_rate": 0.0001, "loss": 3.923, "loss/crossentropy": 2.243737578392029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2175471931695938, "step": 22262 }, { "epoch": 0.44528, "grad_norm": 1.8046875, "grad_norm_var": 0.01102294921875, "learning_rate": 0.0001, "loss": 3.9973, "loss/crossentropy": 2.231359362602234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19454128295183182, "step": 22264 }, { "epoch": 0.44532, "grad_norm": 1.984375, "grad_norm_var": 0.0114898681640625, "learning_rate": 0.0001, "loss": 4.0373, "loss/crossentropy": 2.08256071805954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158442661166191, "step": 22266 }, { "epoch": 0.44536, "grad_norm": 1.796875, "grad_norm_var": 0.010520172119140626, "learning_rate": 0.0001, "loss": 3.9418, "loss/crossentropy": 2.089757025241852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029227539896965, "step": 22268 }, { "epoch": 0.4454, "grad_norm": 1.8046875, "grad_norm_var": 0.011017862955729167, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 2.0893616676330566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958554983139038, "step": 22270 }, { "epoch": 0.44544, "grad_norm": 2.0625, "grad_norm_var": 0.0145751953125, "learning_rate": 0.0001, "loss": 4.0382, "loss/crossentropy": 1.9625884890556335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20560869574546814, "step": 22272 }, { "epoch": 0.44548, "grad_norm": 2.015625, "grad_norm_var": 0.012360636393229167, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 2.4028927087783813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21448686718940735, "step": 22274 }, { "epoch": 0.44552, "grad_norm": 2.109375, "grad_norm_var": 0.010890452067057292, "learning_rate": 0.0001, "loss": 4.1805, "loss/crossentropy": 2.16468608379364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987794190645218, "step": 22276 }, { "epoch": 0.44556, "grad_norm": 1.9453125, "grad_norm_var": 0.010326131184895834, "learning_rate": 0.0001, "loss": 3.9558, "loss/crossentropy": 1.992479383945465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19877834618091583, "step": 22278 }, { "epoch": 0.4456, "grad_norm": 1.9375, "grad_norm_var": 0.008473459879557292, "learning_rate": 0.0001, "loss": 3.912, "loss/crossentropy": 2.0368640422821045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20028480142354965, "step": 22280 }, { "epoch": 0.44564, "grad_norm": 2.0, "grad_norm_var": 0.017753092447916667, "learning_rate": 0.0001, "loss": 4.4026, "loss/crossentropy": 2.252933144569397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19834356755018234, "step": 22282 }, { "epoch": 0.44568, "grad_norm": 1.84375, "grad_norm_var": 0.01628392537434896, "learning_rate": 0.0001, "loss": 3.9787, "loss/crossentropy": 1.8675458431243896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1786389946937561, "step": 22284 }, { "epoch": 0.44572, "grad_norm": 1.8828125, "grad_norm_var": 0.01455078125, "learning_rate": 0.0001, "loss": 4.254, "loss/crossentropy": 2.0105971097946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18405906856060028, "step": 22286 }, { "epoch": 0.44576, "grad_norm": 1.9765625, "grad_norm_var": 0.014598592122395834, "learning_rate": 0.0001, "loss": 3.8389, "loss/crossentropy": 1.8162254095077515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18923819810152054, "step": 22288 }, { "epoch": 0.4458, "grad_norm": 1.890625, "grad_norm_var": 0.01656061808268229, "learning_rate": 0.0001, "loss": 3.8374, "loss/crossentropy": 1.7614133954048157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19005076587200165, "step": 22290 }, { "epoch": 0.44584, "grad_norm": 2.0625, "grad_norm_var": 0.1847551981608073, "learning_rate": 0.0001, "loss": 4.135, "loss/crossentropy": 2.135009288787842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20304158329963684, "step": 22292 }, { "epoch": 0.44588, "grad_norm": 1.890625, "grad_norm_var": 0.18526611328125, "learning_rate": 0.0001, "loss": 3.9944, "loss/crossentropy": 1.897695004940033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19377534836530685, "step": 22294 }, { "epoch": 0.44592, "grad_norm": 2.03125, "grad_norm_var": 0.182763671875, "learning_rate": 0.0001, "loss": 4.1663, "loss/crossentropy": 2.0558266043663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20582793653011322, "step": 22296 }, { "epoch": 0.44596, "grad_norm": 2.015625, "grad_norm_var": 0.18102188110351564, "learning_rate": 0.0001, "loss": 3.9538, "loss/crossentropy": 2.329292058944702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20754558593034744, "step": 22298 }, { "epoch": 0.446, "grad_norm": 1.8203125, "grad_norm_var": 0.18606338500976563, "learning_rate": 0.0001, "loss": 3.8853, "loss/crossentropy": 2.2229275703430176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20154273509979248, "step": 22300 }, { "epoch": 0.44604, "grad_norm": 2.09375, "grad_norm_var": 0.18578465779622397, "learning_rate": 0.0001, "loss": 3.992, "loss/crossentropy": 1.9050685167312622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17552578449249268, "step": 22302 }, { "epoch": 0.44608, "grad_norm": 1.84375, "grad_norm_var": 0.18623021443684895, "learning_rate": 0.0001, "loss": 4.2241, "loss/crossentropy": 2.139374792575836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19087333232164383, "step": 22304 }, { "epoch": 0.44612, "grad_norm": 2.0, "grad_norm_var": 0.18642578125, "learning_rate": 0.0001, "loss": 4.1013, "loss/crossentropy": 1.8093907237052917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805548369884491, "step": 22306 }, { "epoch": 0.44616, "grad_norm": 1.859375, "grad_norm_var": 0.030443318684895835, "learning_rate": 0.0001, "loss": 3.7124, "loss/crossentropy": 1.8334048390388489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16753460466861725, "step": 22308 }, { "epoch": 0.4462, "grad_norm": 2.046875, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 4.1018, "loss/crossentropy": 2.1181896924972534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17999915778636932, "step": 22310 }, { "epoch": 0.44624, "grad_norm": 1.9453125, "grad_norm_var": 0.02830785115559896, "learning_rate": 0.0001, "loss": 3.9938, "loss/crossentropy": 2.3377938270568848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20874116569757462, "step": 22312 }, { "epoch": 0.44628, "grad_norm": 1.921875, "grad_norm_var": 0.028173828125, "learning_rate": 0.0001, "loss": 4.3027, "loss/crossentropy": 2.2557464838027954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159905582666397, "step": 22314 }, { "epoch": 0.44632, "grad_norm": 1.9296875, "grad_norm_var": 0.02478612263997396, "learning_rate": 0.0001, "loss": 4.1486, "loss/crossentropy": 1.690861165523529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1791190207004547, "step": 22316 }, { "epoch": 0.44636, "grad_norm": 1.921875, "grad_norm_var": 0.02339452107747396, "learning_rate": 0.0001, "loss": 3.9995, "loss/crossentropy": 2.282811760902405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278990000486374, "step": 22318 }, { "epoch": 0.4464, "grad_norm": 2.046875, "grad_norm_var": 0.023021443684895834, "learning_rate": 0.0001, "loss": 4.0971, "loss/crossentropy": 2.087849497795105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19879884272813797, "step": 22320 }, { "epoch": 0.44644, "grad_norm": 2.0, "grad_norm_var": 0.024181874593098958, "learning_rate": 0.0001, "loss": 4.2878, "loss/crossentropy": 1.7355778217315674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1880098134279251, "step": 22322 }, { "epoch": 0.44648, "grad_norm": 1.8515625, "grad_norm_var": 0.006556955973307291, "learning_rate": 0.0001, "loss": 3.935, "loss/crossentropy": 1.9930670857429504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.195980504155159, "step": 22324 }, { "epoch": 0.44652, "grad_norm": 1.890625, "grad_norm_var": 0.006990305582682292, "learning_rate": 0.0001, "loss": 4.058, "loss/crossentropy": 2.2104332447052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21003258973360062, "step": 22326 }, { "epoch": 0.44656, "grad_norm": 2.15625, "grad_norm_var": 0.009208170572916667, "learning_rate": 0.0001, "loss": 3.876, "loss/crossentropy": 1.812508225440979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18001863360404968, "step": 22328 }, { "epoch": 0.4466, "grad_norm": 2.09375, "grad_norm_var": 0.009992472330729167, "learning_rate": 0.0001, "loss": 4.2515, "loss/crossentropy": 2.0790328979492188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21385060250759125, "step": 22330 }, { "epoch": 0.44664, "grad_norm": 2.09375, "grad_norm_var": 0.010206858317057291, "learning_rate": 0.0001, "loss": 3.9865, "loss/crossentropy": 1.778535783290863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18019723147153854, "step": 22332 }, { "epoch": 0.44668, "grad_norm": 1.8671875, "grad_norm_var": 0.011714680989583334, "learning_rate": 0.0001, "loss": 4.1055, "loss/crossentropy": 2.1208608746528625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075667828321457, "step": 22334 }, { "epoch": 0.44672, "grad_norm": 1.9140625, "grad_norm_var": 0.013523101806640625, "learning_rate": 0.0001, "loss": 3.8638, "loss/crossentropy": 1.9015109539031982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18012915551662445, "step": 22336 }, { "epoch": 0.44676, "grad_norm": 1.921875, "grad_norm_var": 0.01219482421875, "learning_rate": 0.0001, "loss": 3.9193, "loss/crossentropy": 1.844041883945465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18436504900455475, "step": 22338 }, { "epoch": 0.4468, "grad_norm": 1.90625, "grad_norm_var": 0.012889607747395834, "learning_rate": 0.0001, "loss": 3.8859, "loss/crossentropy": 1.730314016342163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16672005504369736, "step": 22340 }, { "epoch": 0.44684, "grad_norm": 2.03125, "grad_norm_var": 0.06477762858072916, "learning_rate": 0.0001, "loss": 3.9309, "loss/crossentropy": 2.385637402534485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20245376229286194, "step": 22342 }, { "epoch": 0.44688, "grad_norm": 1.9765625, "grad_norm_var": 0.06330337524414062, "learning_rate": 0.0001, "loss": 4.0168, "loss/crossentropy": 2.033573269844055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19226811826229095, "step": 22344 }, { "epoch": 0.44692, "grad_norm": 2.03125, "grad_norm_var": 0.06444498697916666, "learning_rate": 0.0001, "loss": 3.9613, "loss/crossentropy": 2.018698275089264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19148993492126465, "step": 22346 }, { "epoch": 0.44696, "grad_norm": 1.8671875, "grad_norm_var": 0.06483739217122396, "learning_rate": 0.0001, "loss": 4.0311, "loss/crossentropy": 1.7709164023399353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18092583864927292, "step": 22348 }, { "epoch": 0.447, "grad_norm": 2.15625, "grad_norm_var": 0.06496480305989584, "learning_rate": 0.0001, "loss": 4.1857, "loss/crossentropy": 1.9011916518211365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18185202777385712, "step": 22350 }, { "epoch": 0.44704, "grad_norm": 1.875, "grad_norm_var": 0.0629547119140625, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 1.9298787117004395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18417780846357346, "step": 22352 }, { "epoch": 0.44708, "grad_norm": 1.96875, "grad_norm_var": 0.0615875244140625, "learning_rate": 0.0001, "loss": 4.0872, "loss/crossentropy": 2.3658339977264404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2167125567793846, "step": 22354 }, { "epoch": 0.44712, "grad_norm": 1.9609375, "grad_norm_var": 0.06006571451822917, "learning_rate": 0.0001, "loss": 4.092, "loss/crossentropy": 2.150742769241333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21436245739459991, "step": 22356 }, { "epoch": 0.44716, "grad_norm": 1.796875, "grad_norm_var": 0.007879384358723958, "learning_rate": 0.0001, "loss": 4.099, "loss/crossentropy": 2.197741746902466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19714494049549103, "step": 22358 }, { "epoch": 0.4472, "grad_norm": 1.8671875, "grad_norm_var": 0.008286285400390624, "learning_rate": 0.0001, "loss": 4.0584, "loss/crossentropy": 2.11471688747406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19025350362062454, "step": 22360 }, { "epoch": 0.44724, "grad_norm": 2.015625, "grad_norm_var": 0.0075724283854166664, "learning_rate": 0.0001, "loss": 3.9815, "loss/crossentropy": 2.2634165287017822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038334459066391, "step": 22362 }, { "epoch": 0.44728, "grad_norm": 1.984375, "grad_norm_var": 0.007181549072265625, "learning_rate": 0.0001, "loss": 4.1587, "loss/crossentropy": 2.3305805921554565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206403762102127, "step": 22364 }, { "epoch": 0.44732, "grad_norm": 1.921875, "grad_norm_var": 0.0032867431640625, "learning_rate": 0.0001, "loss": 3.9196, "loss/crossentropy": 1.8015141487121582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1901993378996849, "step": 22366 }, { "epoch": 0.44736, "grad_norm": 2.40625, "grad_norm_var": 0.017256673177083334, "learning_rate": 0.0001, "loss": 4.3285, "loss/crossentropy": 2.325510263442993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076701521873474, "step": 22368 }, { "epoch": 0.4474, "grad_norm": 1.8203125, "grad_norm_var": 0.0187652587890625, "learning_rate": 0.0001, "loss": 4.0401, "loss/crossentropy": 1.9470626711845398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854722648859024, "step": 22370 }, { "epoch": 0.44744, "grad_norm": 1.78125, "grad_norm_var": 0.020442708333333334, "learning_rate": 0.0001, "loss": 3.9825, "loss/crossentropy": 1.7524075508117676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17917660623788834, "step": 22372 }, { "epoch": 0.44748, "grad_norm": 1.921875, "grad_norm_var": 0.018480428059895835, "learning_rate": 0.0001, "loss": 3.5927, "loss/crossentropy": 2.0228232741355896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19089559465646744, "step": 22374 }, { "epoch": 0.44752, "grad_norm": 1.9375, "grad_norm_var": 0.019846343994140626, "learning_rate": 0.0001, "loss": 4.0478, "loss/crossentropy": 2.289384961128235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21693934500217438, "step": 22376 }, { "epoch": 0.44756, "grad_norm": 2.125, "grad_norm_var": 0.021996053059895833, "learning_rate": 0.0001, "loss": 3.6972, "loss/crossentropy": 1.9497195482254028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18409277498722076, "step": 22378 }, { "epoch": 0.4476, "grad_norm": 1.953125, "grad_norm_var": 0.022004191080729166, "learning_rate": 0.0001, "loss": 4.2471, "loss/crossentropy": 2.184138000011444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19307169318199158, "step": 22380 }, { "epoch": 0.44764, "grad_norm": 1.9765625, "grad_norm_var": 0.02208226521809896, "learning_rate": 0.0001, "loss": 4.0119, "loss/crossentropy": 2.0050132274627686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19592129439115524, "step": 22382 }, { "epoch": 0.44768, "grad_norm": 1.75, "grad_norm_var": 0.012475331624348959, "learning_rate": 0.0001, "loss": 3.8577, "loss/crossentropy": 1.725416898727417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17788676172494888, "step": 22384 }, { "epoch": 0.44772, "grad_norm": 2.140625, "grad_norm_var": 0.015103912353515625, "learning_rate": 0.0001, "loss": 3.8859, "loss/crossentropy": 1.8454214930534363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18375547230243683, "step": 22386 }, { "epoch": 0.44776, "grad_norm": 2.0, "grad_norm_var": 0.013166300455729167, "learning_rate": 0.0001, "loss": 4.1856, "loss/crossentropy": 2.090963661670685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20424450188875198, "step": 22388 }, { "epoch": 0.4478, "grad_norm": 1.875, "grad_norm_var": 0.014296213785807291, "learning_rate": 0.0001, "loss": 3.5071, "loss/crossentropy": 1.7257133722305298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18389710783958435, "step": 22390 }, { "epoch": 0.44784, "grad_norm": 2.015625, "grad_norm_var": 0.012153879801432291, "learning_rate": 0.0001, "loss": 4.2002, "loss/crossentropy": 2.0955962538719177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20340529829263687, "step": 22392 }, { "epoch": 0.44788, "grad_norm": 2.046875, "grad_norm_var": 0.010103098551432292, "learning_rate": 0.0001, "loss": 3.965, "loss/crossentropy": 1.8837090730667114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188998244702816, "step": 22394 }, { "epoch": 0.44792, "grad_norm": 1.875, "grad_norm_var": 0.010092926025390626, "learning_rate": 0.0001, "loss": 4.0777, "loss/crossentropy": 2.351726531982422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22628787904977798, "step": 22396 }, { "epoch": 0.44796, "grad_norm": 1.90625, "grad_norm_var": 0.010107421875, "learning_rate": 0.0001, "loss": 3.8849, "loss/crossentropy": 2.012966811656952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19502227008342743, "step": 22398 }, { "epoch": 0.448, "grad_norm": 1.890625, "grad_norm_var": 0.0080230712890625, "learning_rate": 0.0001, "loss": 3.9682, "loss/crossentropy": 1.6796467900276184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17925339192152023, "step": 22400 }, { "epoch": 0.44804, "grad_norm": 1.75, "grad_norm_var": 0.006696573893229167, "learning_rate": 0.0001, "loss": 3.7172, "loss/crossentropy": 2.1229045391082764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19038531184196472, "step": 22402 }, { "epoch": 0.44808, "grad_norm": 2.046875, "grad_norm_var": 0.007372792561848958, "learning_rate": 0.0001, "loss": 4.1287, "loss/crossentropy": 2.251511335372925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2037583664059639, "step": 22404 }, { "epoch": 0.44812, "grad_norm": 1.8515625, "grad_norm_var": 0.007096099853515625, "learning_rate": 0.0001, "loss": 3.9975, "loss/crossentropy": 1.768878161907196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192366823554039, "step": 22406 }, { "epoch": 0.44816, "grad_norm": 1.78125, "grad_norm_var": 0.008835601806640624, "learning_rate": 0.0001, "loss": 3.7432, "loss/crossentropy": 1.9983880519866943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903810203075409, "step": 22408 }, { "epoch": 0.4482, "grad_norm": 1.859375, "grad_norm_var": 0.007783762613932292, "learning_rate": 0.0001, "loss": 4.1312, "loss/crossentropy": 2.0608294010162354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20538168400526047, "step": 22410 }, { "epoch": 0.44824, "grad_norm": 1.921875, "grad_norm_var": 0.0072509765625, "learning_rate": 0.0001, "loss": 4.0167, "loss/crossentropy": 2.137459099292755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062518671154976, "step": 22412 }, { "epoch": 0.44828, "grad_norm": 1.8515625, "grad_norm_var": 0.007437896728515625, "learning_rate": 0.0001, "loss": 3.8843, "loss/crossentropy": 2.090251922607422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19352033734321594, "step": 22414 }, { "epoch": 0.44832, "grad_norm": 1.9140625, "grad_norm_var": 0.006359608968098959, "learning_rate": 0.0001, "loss": 3.9818, "loss/crossentropy": 1.8260875344276428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17446549236774445, "step": 22416 }, { "epoch": 0.44836, "grad_norm": 1.96875, "grad_norm_var": 0.00504150390625, "learning_rate": 0.0001, "loss": 4.1228, "loss/crossentropy": 2.4598742723464966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21792489290237427, "step": 22418 }, { "epoch": 0.4484, "grad_norm": 1.7734375, "grad_norm_var": 0.006113433837890625, "learning_rate": 0.0001, "loss": 4.0063, "loss/crossentropy": 2.120418429374695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20342590659856796, "step": 22420 }, { "epoch": 0.44844, "grad_norm": 1.90625, "grad_norm_var": 0.007795969645182292, "learning_rate": 0.0001, "loss": 4.2376, "loss/crossentropy": 2.0946252942085266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19902334362268448, "step": 22422 }, { "epoch": 0.44848, "grad_norm": 2.015625, "grad_norm_var": 0.007112375895182292, "learning_rate": 0.0001, "loss": 4.4449, "loss/crossentropy": 2.395688056945801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158007100224495, "step": 22424 }, { "epoch": 0.44852, "grad_norm": 1.796875, "grad_norm_var": 0.0090087890625, "learning_rate": 0.0001, "loss": 4.0946, "loss/crossentropy": 2.1429702043533325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2130357325077057, "step": 22426 }, { "epoch": 0.44856, "grad_norm": 2.078125, "grad_norm_var": 0.010050201416015625, "learning_rate": 0.0001, "loss": 4.0105, "loss/crossentropy": 2.0070658922195435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955135241150856, "step": 22428 }, { "epoch": 0.4486, "grad_norm": 1.9375, "grad_norm_var": 0.009308878580729167, "learning_rate": 0.0001, "loss": 4.0902, "loss/crossentropy": 2.1505234241485596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20378431677818298, "step": 22430 }, { "epoch": 0.44864, "grad_norm": 1.9296875, "grad_norm_var": 0.0091705322265625, "learning_rate": 0.0001, "loss": 4.3029, "loss/crossentropy": 2.4345964193344116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19363048672676086, "step": 22432 }, { "epoch": 0.44868, "grad_norm": 1.8515625, "grad_norm_var": 0.010811360677083333, "learning_rate": 0.0001, "loss": 3.9744, "loss/crossentropy": 1.922882616519928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18355388939380646, "step": 22434 }, { "epoch": 0.44872, "grad_norm": 1.96875, "grad_norm_var": 0.008369954427083333, "learning_rate": 0.0001, "loss": 3.9232, "loss/crossentropy": 2.051816701889038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21039457619190216, "step": 22436 }, { "epoch": 0.44876, "grad_norm": 2.15625, "grad_norm_var": 0.009919230143229167, "learning_rate": 0.0001, "loss": 4.0486, "loss/crossentropy": 2.375608444213867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21171333640813828, "step": 22438 }, { "epoch": 0.4488, "grad_norm": 1.9921875, "grad_norm_var": 0.008937327067057292, "learning_rate": 0.0001, "loss": 4.2023, "loss/crossentropy": 2.02259361743927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2248649150133133, "step": 22440 }, { "epoch": 0.44884, "grad_norm": 1.9765625, "grad_norm_var": 0.007193756103515625, "learning_rate": 0.0001, "loss": 4.1429, "loss/crossentropy": 2.010511100292206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19157880544662476, "step": 22442 }, { "epoch": 0.44888, "grad_norm": 2.0625, "grad_norm_var": 0.009030914306640625, "learning_rate": 0.0001, "loss": 4.4214, "loss/crossentropy": 2.3076666593551636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125840112566948, "step": 22444 }, { "epoch": 0.44892, "grad_norm": 1.8125, "grad_norm_var": 0.010227203369140625, "learning_rate": 0.0001, "loss": 3.772, "loss/crossentropy": 1.780324101448059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16909868270158768, "step": 22446 }, { "epoch": 0.44896, "grad_norm": 1.8984375, "grad_norm_var": 0.0105621337890625, "learning_rate": 0.0001, "loss": 4.2386, "loss/crossentropy": 2.1713266372680664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1884608194231987, "step": 22448 }, { "epoch": 0.449, "grad_norm": 1.890625, "grad_norm_var": 0.009444173177083333, "learning_rate": 0.0001, "loss": 4.2591, "loss/crossentropy": 2.1273884773254395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.181084506213665, "step": 22450 }, { "epoch": 0.44904, "grad_norm": 1.8671875, "grad_norm_var": 0.009850819905598959, "learning_rate": 0.0001, "loss": 4.0999, "loss/crossentropy": 1.8685917258262634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984790861606598, "step": 22452 }, { "epoch": 0.44908, "grad_norm": 1.953125, "grad_norm_var": 0.006689453125, "learning_rate": 0.0001, "loss": 3.9955, "loss/crossentropy": 1.8099998831748962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186032235622406, "step": 22454 }, { "epoch": 0.44912, "grad_norm": 1.8671875, "grad_norm_var": 0.007063802083333333, "learning_rate": 0.0001, "loss": 3.8906, "loss/crossentropy": 2.111889600753784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19286638498306274, "step": 22456 }, { "epoch": 0.44916, "grad_norm": 1.8125, "grad_norm_var": 0.008984120686848958, "learning_rate": 0.0001, "loss": 3.7561, "loss/crossentropy": 2.041996479034424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18484222888946533, "step": 22458 }, { "epoch": 0.4492, "grad_norm": 2.0625, "grad_norm_var": 0.0056874593098958336, "learning_rate": 0.0001, "loss": 3.8578, "loss/crossentropy": 1.755624771118164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1807885393500328, "step": 22460 }, { "epoch": 0.44924, "grad_norm": 1.8984375, "grad_norm_var": 0.0054840087890625, "learning_rate": 0.0001, "loss": 3.9186, "loss/crossentropy": 2.272615075111389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22077642381191254, "step": 22462 }, { "epoch": 0.44928, "grad_norm": 1.828125, "grad_norm_var": 0.0053179423014322914, "learning_rate": 0.0001, "loss": 3.7631, "loss/crossentropy": 2.1574344635009766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905694305896759, "step": 22464 }, { "epoch": 0.44932, "grad_norm": 2.0, "grad_norm_var": 0.008463287353515625, "learning_rate": 0.0001, "loss": 4.1746, "loss/crossentropy": 2.3979402780532837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21937933564186096, "step": 22466 }, { "epoch": 0.44936, "grad_norm": 1.984375, "grad_norm_var": 0.009250640869140625, "learning_rate": 0.0001, "loss": 4.0639, "loss/crossentropy": 2.181613326072693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219841867685318, "step": 22468 }, { "epoch": 0.4494, "grad_norm": 2.03125, "grad_norm_var": 0.01041259765625, "learning_rate": 0.0001, "loss": 4.032, "loss/crossentropy": 2.2507534623146057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20193026959896088, "step": 22470 }, { "epoch": 0.44944, "grad_norm": 1.984375, "grad_norm_var": 0.013278961181640625, "learning_rate": 0.0001, "loss": 4.3495, "loss/crossentropy": 2.288329839706421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20958562195301056, "step": 22472 }, { "epoch": 0.44948, "grad_norm": 1.96875, "grad_norm_var": 0.00838623046875, "learning_rate": 0.0001, "loss": 3.8597, "loss/crossentropy": 2.0730547308921814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21679005771875381, "step": 22474 }, { "epoch": 0.44952, "grad_norm": 1.96875, "grad_norm_var": 0.008036041259765625, "learning_rate": 0.0001, "loss": 4.2922, "loss/crossentropy": 2.1609703302383423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996021345257759, "step": 22476 }, { "epoch": 0.44956, "grad_norm": 2.078125, "grad_norm_var": 0.007738240559895833, "learning_rate": 0.0001, "loss": 4.048, "loss/crossentropy": 1.857498288154602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20019329339265823, "step": 22478 }, { "epoch": 0.4496, "grad_norm": 2.046875, "grad_norm_var": 0.0049468994140625, "learning_rate": 0.0001, "loss": 4.1772, "loss/crossentropy": 1.9688559174537659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18978293240070343, "step": 22480 }, { "epoch": 0.44964, "grad_norm": 1.9296875, "grad_norm_var": 0.005391184488932292, "learning_rate": 0.0001, "loss": 4.1759, "loss/crossentropy": 2.126068413257599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19416333734989166, "step": 22482 }, { "epoch": 0.44968, "grad_norm": 1.9375, "grad_norm_var": 0.0064361572265625, "learning_rate": 0.0001, "loss": 3.9915, "loss/crossentropy": 2.0684805512428284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21337899565696716, "step": 22484 }, { "epoch": 0.44972, "grad_norm": 1.9140625, "grad_norm_var": 0.007669830322265625, "learning_rate": 0.0001, "loss": 4.0065, "loss/crossentropy": 1.965733289718628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928843855857849, "step": 22486 }, { "epoch": 0.44976, "grad_norm": 1.9609375, "grad_norm_var": 0.0059478759765625, "learning_rate": 0.0001, "loss": 4.1346, "loss/crossentropy": 1.80375075340271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18274405598640442, "step": 22488 }, { "epoch": 0.4498, "grad_norm": 2.0, "grad_norm_var": 0.008284505208333333, "learning_rate": 0.0001, "loss": 3.8954, "loss/crossentropy": 1.939602553844452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18628022074699402, "step": 22490 }, { "epoch": 0.44984, "grad_norm": 1.890625, "grad_norm_var": 0.009276326497395833, "learning_rate": 0.0001, "loss": 4.0224, "loss/crossentropy": 2.408365488052368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21072806417942047, "step": 22492 }, { "epoch": 0.44988, "grad_norm": 1.953125, "grad_norm_var": 0.008961741129557292, "learning_rate": 0.0001, "loss": 4.0382, "loss/crossentropy": 2.1751617789268494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21095305681228638, "step": 22494 }, { "epoch": 0.44992, "grad_norm": 1.9375, "grad_norm_var": 0.03264745076497396, "learning_rate": 0.0001, "loss": 4.2429, "loss/crossentropy": 2.3348854780197144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21335425972938538, "step": 22496 }, { "epoch": 0.44996, "grad_norm": 1.9375, "grad_norm_var": 0.03258031209309896, "learning_rate": 0.0001, "loss": 3.9947, "loss/crossentropy": 2.096919059753418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19042403250932693, "step": 22498 }, { "epoch": 0.45, "grad_norm": 1.875, "grad_norm_var": 0.033056640625, "learning_rate": 0.0001, "loss": 3.8055, "loss/crossentropy": 1.950230062007904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19603446125984192, "step": 22500 }, { "epoch": 0.45004, "grad_norm": 2.0, "grad_norm_var": 0.032692209879557295, "learning_rate": 0.0001, "loss": 4.2118, "loss/crossentropy": 2.3010233640670776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23052462935447693, "step": 22502 }, { "epoch": 0.45008, "grad_norm": 2.03125, "grad_norm_var": 0.03386408487955729, "learning_rate": 0.0001, "loss": 3.8238, "loss/crossentropy": 2.143756926059723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19040528684854507, "step": 22504 }, { "epoch": 0.45012, "grad_norm": 2.3125, "grad_norm_var": 0.03865941365559896, "learning_rate": 0.0001, "loss": 4.194, "loss/crossentropy": 1.9510119557380676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19400011003017426, "step": 22506 }, { "epoch": 0.45016, "grad_norm": 2.203125, "grad_norm_var": 0.03951822916666667, "learning_rate": 0.0001, "loss": 4.0304, "loss/crossentropy": 2.2680559158325195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22951483726501465, "step": 22508 }, { "epoch": 0.4502, "grad_norm": 2.140625, "grad_norm_var": 0.19219741821289063, "learning_rate": 0.0001, "loss": 4.0042, "loss/crossentropy": 2.3191086053848267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179061621427536, "step": 22510 }, { "epoch": 0.45024, "grad_norm": 2.125, "grad_norm_var": 0.17602310180664063, "learning_rate": 0.0001, "loss": 4.1409, "loss/crossentropy": 2.2202601432800293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21782507002353668, "step": 22512 }, { "epoch": 0.45028, "grad_norm": 2.078125, "grad_norm_var": 0.17064208984375, "learning_rate": 0.0001, "loss": 4.1591, "loss/crossentropy": 1.9649160504341125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19763009250164032, "step": 22514 }, { "epoch": 0.45032, "grad_norm": 2.125, "grad_norm_var": 0.1665435791015625, "learning_rate": 0.0001, "loss": 4.4044, "loss/crossentropy": 2.0819711089134216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008282020688057, "step": 22516 }, { "epoch": 0.45036, "grad_norm": 1.890625, "grad_norm_var": 0.26408589680989586, "learning_rate": 0.0001, "loss": 4.2564, "loss/crossentropy": 2.2396440505981445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21240604668855667, "step": 22518 }, { "epoch": 0.4504, "grad_norm": 1.9296875, "grad_norm_var": 0.268542226155599, "learning_rate": 0.0001, "loss": 3.9298, "loss/crossentropy": 2.21976238489151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20444615930318832, "step": 22520 }, { "epoch": 0.45044, "grad_norm": 2.015625, "grad_norm_var": 0.2693072001139323, "learning_rate": 0.0001, "loss": 4.075, "loss/crossentropy": 2.2542803287506104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2205461636185646, "step": 22522 }, { "epoch": 0.45048, "grad_norm": 2.0, "grad_norm_var": 0.2702555338541667, "learning_rate": 0.0001, "loss": 3.9809, "loss/crossentropy": 1.7875661253929138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17205239832401276, "step": 22524 }, { "epoch": 0.45052, "grad_norm": 1.9609375, "grad_norm_var": 0.13923238118489584, "learning_rate": 0.0001, "loss": 3.875, "loss/crossentropy": 2.0036060214042664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21126192808151245, "step": 22526 }, { "epoch": 0.45056, "grad_norm": 2.125, "grad_norm_var": 0.13862279256184895, "learning_rate": 0.0001, "loss": 3.9585, "loss/crossentropy": 1.965367078781128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19563131779432297, "step": 22528 }, { "epoch": 0.4506, "grad_norm": 1.8671875, "grad_norm_var": 0.14239273071289063, "learning_rate": 0.0001, "loss": 4.158, "loss/crossentropy": 1.9605298042297363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932133436203003, "step": 22530 }, { "epoch": 0.45064, "grad_norm": 1.8515625, "grad_norm_var": 0.14579671223958332, "learning_rate": 0.0001, "loss": 3.9343, "loss/crossentropy": 2.1047326922416687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19184139370918274, "step": 22532 }, { "epoch": 0.45068, "grad_norm": 1.8984375, "grad_norm_var": 0.007104237874348958, "learning_rate": 0.0001, "loss": 4.299, "loss/crossentropy": 2.173476457595825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1873651072382927, "step": 22534 }, { "epoch": 0.45072, "grad_norm": 1.9453125, "grad_norm_var": 0.006379191080729167, "learning_rate": 0.0001, "loss": 4.0107, "loss/crossentropy": 1.928326666355133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937912181019783, "step": 22536 }, { "epoch": 0.45076, "grad_norm": 1.953125, "grad_norm_var": 0.00718994140625, "learning_rate": 0.0001, "loss": 3.9559, "loss/crossentropy": 2.2145062685012817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21003177762031555, "step": 22538 }, { "epoch": 0.4508, "grad_norm": 1.96875, "grad_norm_var": 0.008430989583333333, "learning_rate": 0.0001, "loss": 3.9536, "loss/crossentropy": 1.9980989694595337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18583524972200394, "step": 22540 }, { "epoch": 0.45084, "grad_norm": 1.9453125, "grad_norm_var": 0.008552805582682291, "learning_rate": 0.0001, "loss": 4.0124, "loss/crossentropy": 2.0646828413009644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20016716420650482, "step": 22542 }, { "epoch": 0.45088, "grad_norm": 1.890625, "grad_norm_var": 0.006306711832682292, "learning_rate": 0.0001, "loss": 3.9468, "loss/crossentropy": 2.4558991193771362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1985267549753189, "step": 22544 }, { "epoch": 0.45092, "grad_norm": 2.0, "grad_norm_var": 0.006589508056640625, "learning_rate": 0.0001, "loss": 4.1356, "loss/crossentropy": 2.140208601951599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20745448023080826, "step": 22546 }, { "epoch": 0.45096, "grad_norm": 1.9296875, "grad_norm_var": 0.0058176676432291664, "learning_rate": 0.0001, "loss": 4.0322, "loss/crossentropy": 2.0344385504722595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20116931200027466, "step": 22548 }, { "epoch": 0.451, "grad_norm": 1.9296875, "grad_norm_var": 0.007306925455729167, "learning_rate": 0.0001, "loss": 4.1927, "loss/crossentropy": 2.213254153728485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145981639623642, "step": 22550 }, { "epoch": 0.45104, "grad_norm": 1.96875, "grad_norm_var": 0.007260894775390625, "learning_rate": 0.0001, "loss": 4.0203, "loss/crossentropy": 2.2077749967575073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086869403719902, "step": 22552 }, { "epoch": 0.45108, "grad_norm": 1.8984375, "grad_norm_var": 0.0064208984375, "learning_rate": 0.0001, "loss": 3.9152, "loss/crossentropy": 2.131449520587921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20110997557640076, "step": 22554 }, { "epoch": 0.45112, "grad_norm": 1.921875, "grad_norm_var": 0.004107411702473958, "learning_rate": 0.0001, "loss": 4.0194, "loss/crossentropy": 1.7281222343444824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18489576131105423, "step": 22556 }, { "epoch": 0.45116, "grad_norm": 3.4375, "grad_norm_var": 0.14352188110351563, "learning_rate": 0.0001, "loss": 3.9235, "loss/crossentropy": 1.9907403588294983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19740281254053116, "step": 22558 }, { "epoch": 0.4512, "grad_norm": 2.046875, "grad_norm_var": 0.14071451822916667, "learning_rate": 0.0001, "loss": 4.2471, "loss/crossentropy": 2.042698383331299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20195024460554123, "step": 22560 }, { "epoch": 0.45124, "grad_norm": 1.921875, "grad_norm_var": 0.14180272420247395, "learning_rate": 0.0001, "loss": 3.9868, "loss/crossentropy": 2.267351269721985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20345290750265121, "step": 22562 }, { "epoch": 0.45128, "grad_norm": 1.8515625, "grad_norm_var": 0.1418413798014323, "learning_rate": 0.0001, "loss": 3.8913, "loss/crossentropy": 2.031484067440033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18091221153736115, "step": 22564 }, { "epoch": 0.45132, "grad_norm": 2.125, "grad_norm_var": 0.14130757649739584, "learning_rate": 0.0001, "loss": 4.3266, "loss/crossentropy": 2.327489733695984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21997347474098206, "step": 22566 }, { "epoch": 0.45136, "grad_norm": 2.046875, "grad_norm_var": 0.1402099609375, "learning_rate": 0.0001, "loss": 4.3752, "loss/crossentropy": 2.117951452732086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1775515079498291, "step": 22568 }, { "epoch": 0.4514, "grad_norm": 1.90625, "grad_norm_var": 0.1423906962076823, "learning_rate": 0.0001, "loss": 3.9752, "loss/crossentropy": 1.9014524817466736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19188529253005981, "step": 22570 }, { "epoch": 0.45144, "grad_norm": 2.09375, "grad_norm_var": 0.13876851399739584, "learning_rate": 0.0001, "loss": 4.2163, "loss/crossentropy": 2.0534915924072266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20141670107841492, "step": 22572 }, { "epoch": 0.45148, "grad_norm": 2.203125, "grad_norm_var": 0.009810384114583333, "learning_rate": 0.0001, "loss": 4.3861, "loss/crossentropy": 2.018342673778534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20101028680801392, "step": 22574 }, { "epoch": 0.45152, "grad_norm": 1.8515625, "grad_norm_var": 0.011755116780598958, "learning_rate": 0.0001, "loss": 4.0395, "loss/crossentropy": 2.000533401966095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18452423810958862, "step": 22576 }, { "epoch": 0.45156, "grad_norm": 2.015625, "grad_norm_var": 0.012015533447265626, "learning_rate": 0.0001, "loss": 3.9407, "loss/crossentropy": 2.109315812587738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20570579916238785, "step": 22578 }, { "epoch": 0.4516, "grad_norm": 1.9375, "grad_norm_var": 0.011057281494140625, "learning_rate": 0.0001, "loss": 4.1289, "loss/crossentropy": 2.2056689262390137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.195539653301239, "step": 22580 }, { "epoch": 0.45164, "grad_norm": 1.9609375, "grad_norm_var": 0.010162099202473959, "learning_rate": 0.0001, "loss": 4.0627, "loss/crossentropy": 2.246178388595581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960083767771721, "step": 22582 }, { "epoch": 0.45168, "grad_norm": 1.90625, "grad_norm_var": 0.010176340738932291, "learning_rate": 0.0001, "loss": 4.1413, "loss/crossentropy": 1.8758829832077026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19170167297124863, "step": 22584 }, { "epoch": 0.45172, "grad_norm": 1.8984375, "grad_norm_var": 0.0093994140625, "learning_rate": 0.0001, "loss": 3.9841, "loss/crossentropy": 1.9648401141166687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1863914281129837, "step": 22586 }, { "epoch": 0.45176, "grad_norm": 2.015625, "grad_norm_var": 0.008339182535807291, "learning_rate": 0.0001, "loss": 4.2261, "loss/crossentropy": 2.2547377347946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003096491098404, "step": 22588 }, { "epoch": 0.4518, "grad_norm": 1.8828125, "grad_norm_var": 0.004626210530598958, "learning_rate": 0.0001, "loss": 3.9569, "loss/crossentropy": 1.9784765839576721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822790503501892, "step": 22590 }, { "epoch": 0.45184, "grad_norm": 1.96875, "grad_norm_var": 0.0026730855305989584, "learning_rate": 0.0001, "loss": 4.1475, "loss/crossentropy": 1.947198748588562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18320369720458984, "step": 22592 }, { "epoch": 0.45188, "grad_norm": 1.7578125, "grad_norm_var": 0.007625071207682291, "learning_rate": 0.0001, "loss": 3.5234, "loss/crossentropy": 1.934649407863617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1809309870004654, "step": 22594 }, { "epoch": 0.45192, "grad_norm": 1.8828125, "grad_norm_var": 0.009439849853515625, "learning_rate": 0.0001, "loss": 3.9333, "loss/crossentropy": 2.064886689186096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17977996170520782, "step": 22596 }, { "epoch": 0.45196, "grad_norm": 1.9375, "grad_norm_var": 0.011228179931640625, "learning_rate": 0.0001, "loss": 3.8852, "loss/crossentropy": 1.7243138551712036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16666193306446075, "step": 22598 }, { "epoch": 0.452, "grad_norm": 1.8515625, "grad_norm_var": 0.01080322265625, "learning_rate": 0.0001, "loss": 3.9807, "loss/crossentropy": 2.0566282272338867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19942068308591843, "step": 22600 }, { "epoch": 0.45204, "grad_norm": 1.765625, "grad_norm_var": 0.012078603108723959, "learning_rate": 0.0001, "loss": 3.9323, "loss/crossentropy": 2.108401298522949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068444862961769, "step": 22602 }, { "epoch": 0.45208, "grad_norm": 1.96875, "grad_norm_var": 0.011333974202473958, "learning_rate": 0.0001, "loss": 3.9746, "loss/crossentropy": 2.281686544418335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20834453403949738, "step": 22604 }, { "epoch": 0.45212, "grad_norm": 1.8984375, "grad_norm_var": 0.012324778238932292, "learning_rate": 0.0001, "loss": 4.0902, "loss/crossentropy": 1.8627066016197205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18476207554340363, "step": 22606 }, { "epoch": 0.45216, "grad_norm": 1.921875, "grad_norm_var": 0.011567942301432292, "learning_rate": 0.0001, "loss": 4.097, "loss/crossentropy": 2.46126925945282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19410762190818787, "step": 22608 }, { "epoch": 0.4522, "grad_norm": 1.7890625, "grad_norm_var": 0.009106190999348958, "learning_rate": 0.0001, "loss": 3.9222, "loss/crossentropy": 1.8764225244522095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21187759190797806, "step": 22610 }, { "epoch": 0.45224, "grad_norm": 1.8828125, "grad_norm_var": 0.007092030843098959, "learning_rate": 0.0001, "loss": 3.9213, "loss/crossentropy": 1.8535012602806091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1783105507493019, "step": 22612 }, { "epoch": 0.45228, "grad_norm": 1.96875, "grad_norm_var": 0.0054929097493489586, "learning_rate": 0.0001, "loss": 4.1948, "loss/crossentropy": 1.9735658764839172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2031656950712204, "step": 22614 }, { "epoch": 0.45232, "grad_norm": 1.8203125, "grad_norm_var": 0.0064389546712239586, "learning_rate": 0.0001, "loss": 3.8466, "loss/crossentropy": 2.005250871181488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18143242597579956, "step": 22616 }, { "epoch": 0.45236, "grad_norm": 4.84375, "grad_norm_var": 0.5445149739583334, "learning_rate": 0.0001, "loss": 4.1072, "loss/crossentropy": 2.1731566786766052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042900174856186, "step": 22618 }, { "epoch": 0.4524, "grad_norm": 1.921875, "grad_norm_var": 0.5424496968587239, "learning_rate": 0.0001, "loss": 3.867, "loss/crossentropy": 2.2009552717208862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20621322095394135, "step": 22620 }, { "epoch": 0.45244, "grad_norm": 2.09375, "grad_norm_var": 0.5397664388020833, "learning_rate": 0.0001, "loss": 3.9673, "loss/crossentropy": 2.2014458179473877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910918429493904, "step": 22622 }, { "epoch": 0.45248, "grad_norm": 1.921875, "grad_norm_var": 0.538287099202474, "learning_rate": 0.0001, "loss": 3.8493, "loss/crossentropy": 1.9113351702690125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981104016304016, "step": 22624 }, { "epoch": 0.45252, "grad_norm": 1.8515625, "grad_norm_var": 0.5362223307291667, "learning_rate": 0.0001, "loss": 3.9984, "loss/crossentropy": 1.7521483302116394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18573005497455597, "step": 22626 }, { "epoch": 0.45256, "grad_norm": 1.8046875, "grad_norm_var": 0.5408111572265625, "learning_rate": 0.0001, "loss": 3.8505, "loss/crossentropy": 2.003620207309723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799101158976555, "step": 22628 }, { "epoch": 0.4526, "grad_norm": 2.171875, "grad_norm_var": 0.5576894124348958, "learning_rate": 0.0001, "loss": 4.0585, "loss/crossentropy": 2.17133104801178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21434936672449112, "step": 22630 }, { "epoch": 0.45264, "grad_norm": 1.7890625, "grad_norm_var": 0.5505777994791666, "learning_rate": 0.0001, "loss": 3.8865, "loss/crossentropy": 2.113120198249817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21246708929538727, "step": 22632 }, { "epoch": 0.45268, "grad_norm": 1.9609375, "grad_norm_var": 0.04535319010416667, "learning_rate": 0.0001, "loss": 3.9273, "loss/crossentropy": 1.9400765299797058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19067071378231049, "step": 22634 }, { "epoch": 0.45272, "grad_norm": 1.921875, "grad_norm_var": 0.04595133463541667, "learning_rate": 0.0001, "loss": 4.1057, "loss/crossentropy": 2.107842206954956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903919279575348, "step": 22636 }, { "epoch": 0.45276, "grad_norm": 1.8515625, "grad_norm_var": 0.046529134114583336, "learning_rate": 0.0001, "loss": 3.969, "loss/crossentropy": 2.093311131000519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854369342327118, "step": 22638 }, { "epoch": 0.4528, "grad_norm": 1.9921875, "grad_norm_var": 0.04711278279622396, "learning_rate": 0.0001, "loss": 4.321, "loss/crossentropy": 2.2625142335891724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128172516822815, "step": 22640 }, { "epoch": 0.45284, "grad_norm": 1.890625, "grad_norm_var": 0.04737726847330729, "learning_rate": 0.0001, "loss": 3.9444, "loss/crossentropy": 2.13210928440094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18905573338270187, "step": 22642 }, { "epoch": 0.45288, "grad_norm": 1.75, "grad_norm_var": 0.04838841756184896, "learning_rate": 0.0001, "loss": 3.8324, "loss/crossentropy": 2.1068539023399353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21476633101701736, "step": 22644 }, { "epoch": 0.45292, "grad_norm": 1.921875, "grad_norm_var": 0.009903971354166667, "learning_rate": 0.0001, "loss": 4.0148, "loss/crossentropy": 1.9843144416809082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1885334774851799, "step": 22646 }, { "epoch": 0.45296, "grad_norm": 1.8828125, "grad_norm_var": 0.0125396728515625, "learning_rate": 0.0001, "loss": 4.0819, "loss/crossentropy": 2.3613446950912476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2183755785226822, "step": 22648 }, { "epoch": 0.453, "grad_norm": 1.9375, "grad_norm_var": 0.0125152587890625, "learning_rate": 0.0001, "loss": 3.9229, "loss/crossentropy": 2.2134616374969482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22959928214550018, "step": 22650 }, { "epoch": 0.45304, "grad_norm": 1.8828125, "grad_norm_var": 0.012532552083333334, "learning_rate": 0.0001, "loss": 4.0039, "loss/crossentropy": 2.332140803337097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20412423461675644, "step": 22652 }, { "epoch": 0.45308, "grad_norm": 2.03125, "grad_norm_var": 0.012601470947265625, "learning_rate": 0.0001, "loss": 4.1368, "loss/crossentropy": 1.7886313199996948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1998751536011696, "step": 22654 }, { "epoch": 0.45312, "grad_norm": 1.7578125, "grad_norm_var": 0.012733713785807291, "learning_rate": 0.0001, "loss": 3.8701, "loss/crossentropy": 1.8666648864746094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16398675739765167, "step": 22656 }, { "epoch": 0.45316, "grad_norm": 2.015625, "grad_norm_var": 0.015077463785807292, "learning_rate": 0.0001, "loss": 3.9209, "loss/crossentropy": 1.7308924794197083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17301977425813675, "step": 22658 }, { "epoch": 0.4532, "grad_norm": 1.9921875, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 4.0125, "loss/crossentropy": 2.0344366431236267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18964581936597824, "step": 22660 }, { "epoch": 0.45324, "grad_norm": 1.984375, "grad_norm_var": 0.013991038004557291, "learning_rate": 0.0001, "loss": 4.0698, "loss/crossentropy": 2.1261265873908997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20520442724227905, "step": 22662 }, { "epoch": 0.45328, "grad_norm": 2.046875, "grad_norm_var": 0.008641560872395834, "learning_rate": 0.0001, "loss": 4.169, "loss/crossentropy": 2.163342595100403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064770758152008, "step": 22664 }, { "epoch": 0.45332, "grad_norm": 1.9609375, "grad_norm_var": 0.00960693359375, "learning_rate": 0.0001, "loss": 4.0267, "loss/crossentropy": 2.293798089027405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21860123425722122, "step": 22666 }, { "epoch": 0.45336, "grad_norm": 1.9375, "grad_norm_var": 0.009891764322916666, "learning_rate": 0.0001, "loss": 3.8624, "loss/crossentropy": 2.1405990719795227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19475476443767548, "step": 22668 }, { "epoch": 0.4534, "grad_norm": 1.921875, "grad_norm_var": 0.0090240478515625, "learning_rate": 0.0001, "loss": 4.0807, "loss/crossentropy": 2.4012043476104736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21024177968502045, "step": 22670 }, { "epoch": 0.45344, "grad_norm": 2.015625, "grad_norm_var": 0.008182525634765625, "learning_rate": 0.0001, "loss": 4.1348, "loss/crossentropy": 1.8321769833564758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21026848256587982, "step": 22672 }, { "epoch": 0.45348, "grad_norm": 1.9375, "grad_norm_var": 0.005832672119140625, "learning_rate": 0.0001, "loss": 3.9761, "loss/crossentropy": 1.7738409042358398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17715871334075928, "step": 22674 }, { "epoch": 0.45352, "grad_norm": 1.828125, "grad_norm_var": 0.0066070556640625, "learning_rate": 0.0001, "loss": 4.1336, "loss/crossentropy": 2.406354308128357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20188595354557037, "step": 22676 }, { "epoch": 0.45356, "grad_norm": 1.71875, "grad_norm_var": 0.0100006103515625, "learning_rate": 0.0001, "loss": 3.8723, "loss/crossentropy": 1.977246344089508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1766221970319748, "step": 22678 }, { "epoch": 0.4536, "grad_norm": 1.9296875, "grad_norm_var": 0.007922108968098958, "learning_rate": 0.0001, "loss": 3.9496, "loss/crossentropy": 2.2606882452964783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21319438517093658, "step": 22680 }, { "epoch": 0.45364, "grad_norm": 2.21875, "grad_norm_var": 0.013313547770182291, "learning_rate": 0.0001, "loss": 4.3151, "loss/crossentropy": 2.212466239929199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21307764202356339, "step": 22682 }, { "epoch": 0.45368, "grad_norm": 1.90625, "grad_norm_var": 0.012804921468098958, "learning_rate": 0.0001, "loss": 4.0279, "loss/crossentropy": 1.8246173858642578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18758808821439743, "step": 22684 }, { "epoch": 0.45372, "grad_norm": 1.8046875, "grad_norm_var": 0.013946278889973959, "learning_rate": 0.0001, "loss": 3.9354, "loss/crossentropy": 2.0831995010375977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112889140844345, "step": 22686 }, { "epoch": 0.45376, "grad_norm": 1.953125, "grad_norm_var": 0.013262685139973958, "learning_rate": 0.0001, "loss": 4.0611, "loss/crossentropy": 1.81759911775589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17542830109596252, "step": 22688 }, { "epoch": 0.4538, "grad_norm": 1.9453125, "grad_norm_var": 0.014574178059895833, "learning_rate": 0.0001, "loss": 4.3459, "loss/crossentropy": 1.99278324842453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18871523439884186, "step": 22690 }, { "epoch": 0.45384, "grad_norm": 2.359375, "grad_norm_var": 0.02394383748372396, "learning_rate": 0.0001, "loss": 4.2162, "loss/crossentropy": 2.050497889518738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21048897504806519, "step": 22692 }, { "epoch": 0.45388, "grad_norm": 1.84375, "grad_norm_var": 0.01898981730143229, "learning_rate": 0.0001, "loss": 3.9088, "loss/crossentropy": 1.7781237959861755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1762016862630844, "step": 22694 }, { "epoch": 0.45392, "grad_norm": 1.921875, "grad_norm_var": 0.01878840128580729, "learning_rate": 0.0001, "loss": 3.8821, "loss/crossentropy": 1.8609422445297241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18131760507822037, "step": 22696 }, { "epoch": 0.45396, "grad_norm": 2.1875, "grad_norm_var": 0.018553670247395834, "learning_rate": 0.0001, "loss": 4.3945, "loss/crossentropy": 2.274345874786377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22067608684301376, "step": 22698 }, { "epoch": 0.454, "grad_norm": 2.109375, "grad_norm_var": 0.021036529541015626, "learning_rate": 0.0001, "loss": 4.1889, "loss/crossentropy": 2.0518780946731567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.190098837018013, "step": 22700 }, { "epoch": 0.45404, "grad_norm": 1.8125, "grad_norm_var": 0.020654296875, "learning_rate": 0.0001, "loss": 4.0623, "loss/crossentropy": 1.879017412662506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17030316591262817, "step": 22702 }, { "epoch": 0.45408, "grad_norm": 1.8515625, "grad_norm_var": 0.022944895426432292, "learning_rate": 0.0001, "loss": 4.0942, "loss/crossentropy": 2.090358793735504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893768459558487, "step": 22704 }, { "epoch": 0.45412, "grad_norm": 1.9375, "grad_norm_var": 0.024095662434895835, "learning_rate": 0.0001, "loss": 4.0252, "loss/crossentropy": 1.8985764980316162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20475559681653976, "step": 22706 }, { "epoch": 0.45416, "grad_norm": 1.9453125, "grad_norm_var": 0.014662424723307291, "learning_rate": 0.0001, "loss": 4.0392, "loss/crossentropy": 2.1758298873901367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19116609543561935, "step": 22708 }, { "epoch": 0.4542, "grad_norm": 1.828125, "grad_norm_var": 0.014741770426432292, "learning_rate": 0.0001, "loss": 3.8379, "loss/crossentropy": 1.7963250279426575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838328242301941, "step": 22710 }, { "epoch": 0.45424, "grad_norm": 2.015625, "grad_norm_var": 0.015046183268229167, "learning_rate": 0.0001, "loss": 4.2924, "loss/crossentropy": 2.4492361545562744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21029194444417953, "step": 22712 }, { "epoch": 0.45428, "grad_norm": 1.921875, "grad_norm_var": 0.0115386962890625, "learning_rate": 0.0001, "loss": 3.9314, "loss/crossentropy": 2.10919725894928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20362409204244614, "step": 22714 }, { "epoch": 0.45432, "grad_norm": 1.8984375, "grad_norm_var": 0.008713531494140624, "learning_rate": 0.0001, "loss": 4.0962, "loss/crossentropy": 2.132240355014801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18763454258441925, "step": 22716 }, { "epoch": 0.45436, "grad_norm": 2.0625, "grad_norm_var": 0.008048502604166667, "learning_rate": 0.0001, "loss": 4.0287, "loss/crossentropy": 2.1648387908935547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2098705694079399, "step": 22718 }, { "epoch": 0.4544, "grad_norm": 1.8203125, "grad_norm_var": 0.008109283447265626, "learning_rate": 0.0001, "loss": 3.908, "loss/crossentropy": 2.2334399223327637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21508647501468658, "step": 22720 }, { "epoch": 0.45444, "grad_norm": 1.84375, "grad_norm_var": 0.006101226806640625, "learning_rate": 0.0001, "loss": 3.8521, "loss/crossentropy": 1.8207709193229675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1902608647942543, "step": 22722 }, { "epoch": 0.45448, "grad_norm": 1.84375, "grad_norm_var": 0.007191721598307292, "learning_rate": 0.0001, "loss": 4.0834, "loss/crossentropy": 2.094780683517456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20928193628787994, "step": 22724 }, { "epoch": 0.45452, "grad_norm": 1.8515625, "grad_norm_var": 0.006791178385416667, "learning_rate": 0.0001, "loss": 3.8648, "loss/crossentropy": 2.164665937423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19287051260471344, "step": 22726 }, { "epoch": 0.45456, "grad_norm": 1.890625, "grad_norm_var": 0.0062164306640625, "learning_rate": 0.0001, "loss": 3.9342, "loss/crossentropy": 2.2440634965896606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18155057728290558, "step": 22728 }, { "epoch": 0.4546, "grad_norm": 2.15625, "grad_norm_var": 0.008190663655598958, "learning_rate": 0.0001, "loss": 4.2287, "loss/crossentropy": 2.012277126312256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21760402619838715, "step": 22730 }, { "epoch": 0.45464, "grad_norm": 1.7890625, "grad_norm_var": 0.01033935546875, "learning_rate": 0.0001, "loss": 4.0072, "loss/crossentropy": 2.1217371225357056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944856494665146, "step": 22732 }, { "epoch": 0.45468, "grad_norm": 1.9453125, "grad_norm_var": 0.010300445556640624, "learning_rate": 0.0001, "loss": 3.9806, "loss/crossentropy": 2.3193390369415283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21992851048707962, "step": 22734 }, { "epoch": 0.45472, "grad_norm": 2.09375, "grad_norm_var": 0.012996164957682292, "learning_rate": 0.0001, "loss": 4.3389, "loss/crossentropy": 2.520304322242737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229971744120121, "step": 22736 }, { "epoch": 0.45476, "grad_norm": 1.953125, "grad_norm_var": 0.012898763020833334, "learning_rate": 0.0001, "loss": 4.0665, "loss/crossentropy": 2.270485758781433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22266070544719696, "step": 22738 }, { "epoch": 0.4548, "grad_norm": 1.96875, "grad_norm_var": 0.011945597330729167, "learning_rate": 0.0001, "loss": 4.1006, "loss/crossentropy": 2.046397566795349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19661714136600494, "step": 22740 }, { "epoch": 0.45484, "grad_norm": 1.828125, "grad_norm_var": 0.012788645426432292, "learning_rate": 0.0001, "loss": 4.0564, "loss/crossentropy": 1.925121009349823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19327251613140106, "step": 22742 }, { "epoch": 0.45488, "grad_norm": 1.8671875, "grad_norm_var": 0.01253662109375, "learning_rate": 0.0001, "loss": 4.0178, "loss/crossentropy": 2.1238789558410645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18549078702926636, "step": 22744 }, { "epoch": 0.45492, "grad_norm": 1.8203125, "grad_norm_var": 0.01082763671875, "learning_rate": 0.0001, "loss": 3.8122, "loss/crossentropy": 1.8756769299507141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1634177565574646, "step": 22746 }, { "epoch": 0.45496, "grad_norm": 2.171875, "grad_norm_var": 0.012555948893229167, "learning_rate": 0.0001, "loss": 4.1419, "loss/crossentropy": 2.017587959766388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187566876411438, "step": 22748 }, { "epoch": 0.455, "grad_norm": 1.84375, "grad_norm_var": 0.011735026041666667, "learning_rate": 0.0001, "loss": 3.8677, "loss/crossentropy": 1.9583166241645813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20180843770503998, "step": 22750 }, { "epoch": 0.45504, "grad_norm": 1.8828125, "grad_norm_var": 0.008259836832682292, "learning_rate": 0.0001, "loss": 3.8475, "loss/crossentropy": 1.9483489990234375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19634968787431717, "step": 22752 }, { "epoch": 0.45508, "grad_norm": 1.890625, "grad_norm_var": 0.008270009358723959, "learning_rate": 0.0001, "loss": 3.8734, "loss/crossentropy": 1.727650761604309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17284797132015228, "step": 22754 }, { "epoch": 0.45512, "grad_norm": 1.765625, "grad_norm_var": 0.009262847900390624, "learning_rate": 0.0001, "loss": 3.9471, "loss/crossentropy": 2.306153416633606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20535488426685333, "step": 22756 }, { "epoch": 0.45516, "grad_norm": 1.90625, "grad_norm_var": 0.007755279541015625, "learning_rate": 0.0001, "loss": 4.0382, "loss/crossentropy": 1.8752743601799011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19387705624103546, "step": 22758 }, { "epoch": 0.4552, "grad_norm": 1.953125, "grad_norm_var": 0.007941691080729167, "learning_rate": 0.0001, "loss": 4.2708, "loss/crossentropy": 2.4626048803329468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21318693459033966, "step": 22760 }, { "epoch": 0.45524, "grad_norm": 1.796875, "grad_norm_var": 0.008503977457682292, "learning_rate": 0.0001, "loss": 3.7236, "loss/crossentropy": 1.968520700931549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19532684981822968, "step": 22762 }, { "epoch": 0.45528, "grad_norm": 2.015625, "grad_norm_var": 0.0135162353515625, "learning_rate": 0.0001, "loss": 4.0531, "loss/crossentropy": 1.75477135181427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17470379173755646, "step": 22764 }, { "epoch": 0.45532, "grad_norm": 2.0, "grad_norm_var": 0.017325846354166667, "learning_rate": 0.0001, "loss": 4.0837, "loss/crossentropy": 2.125413417816162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21318642050027847, "step": 22766 }, { "epoch": 0.45536, "grad_norm": 2.109375, "grad_norm_var": 0.01915257771809896, "learning_rate": 0.0001, "loss": 4.1433, "loss/crossentropy": 2.2569944858551025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204448863863945, "step": 22768 }, { "epoch": 0.4554, "grad_norm": 1.8828125, "grad_norm_var": 0.018379720052083333, "learning_rate": 0.0001, "loss": 3.943, "loss/crossentropy": 2.219534397125244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20692889392375946, "step": 22770 }, { "epoch": 0.45544, "grad_norm": 1.8984375, "grad_norm_var": 0.015372467041015626, "learning_rate": 0.0001, "loss": 4.0119, "loss/crossentropy": 2.1480907797813416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20739484578371048, "step": 22772 }, { "epoch": 0.45548, "grad_norm": 1.8359375, "grad_norm_var": 0.016379547119140626, "learning_rate": 0.0001, "loss": 3.8875, "loss/crossentropy": 1.9118182063102722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17766325920820236, "step": 22774 }, { "epoch": 0.45552, "grad_norm": 2.0, "grad_norm_var": 0.018900299072265626, "learning_rate": 0.0001, "loss": 3.9723, "loss/crossentropy": 2.0928712487220764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19869665801525116, "step": 22776 }, { "epoch": 0.45556, "grad_norm": 2.21875, "grad_norm_var": 0.022031402587890624, "learning_rate": 0.0001, "loss": 4.4284, "loss/crossentropy": 2.1542623043060303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22383734583854675, "step": 22778 }, { "epoch": 0.4556, "grad_norm": 1.9453125, "grad_norm_var": 0.019327545166015626, "learning_rate": 0.0001, "loss": 4.0714, "loss/crossentropy": 2.1210632920265198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19574230164289474, "step": 22780 }, { "epoch": 0.45564, "grad_norm": 1.9140625, "grad_norm_var": 0.01756566365559896, "learning_rate": 0.0001, "loss": 3.8178, "loss/crossentropy": 1.975026547908783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18365998566150665, "step": 22782 }, { "epoch": 0.45568, "grad_norm": 1.765625, "grad_norm_var": 0.017549641927083335, "learning_rate": 0.0001, "loss": 3.4904, "loss/crossentropy": 1.5320480465888977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15212352573871613, "step": 22784 }, { "epoch": 0.45572, "grad_norm": 1.953125, "grad_norm_var": 0.01838963826497396, "learning_rate": 0.0001, "loss": 3.9335, "loss/crossentropy": 1.8474896550178528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18486780673265457, "step": 22786 }, { "epoch": 0.45576, "grad_norm": 1.9765625, "grad_norm_var": 0.018553670247395834, "learning_rate": 0.0001, "loss": 4.037, "loss/crossentropy": 1.9554376006126404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18804097175598145, "step": 22788 }, { "epoch": 0.4558, "grad_norm": 2.0625, "grad_norm_var": 0.018968709309895835, "learning_rate": 0.0001, "loss": 4.1759, "loss/crossentropy": 2.110785663127899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21371331810951233, "step": 22790 }, { "epoch": 0.45584, "grad_norm": 1.9765625, "grad_norm_var": 0.016471099853515626, "learning_rate": 0.0001, "loss": 4.1691, "loss/crossentropy": 2.100754976272583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21454931795597076, "step": 22792 }, { "epoch": 0.45588, "grad_norm": 1.9375, "grad_norm_var": 0.006845855712890625, "learning_rate": 0.0001, "loss": 4.0916, "loss/crossentropy": 2.1975361704826355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2107257917523384, "step": 22794 }, { "epoch": 0.45592, "grad_norm": 2.046875, "grad_norm_var": 0.006961822509765625, "learning_rate": 0.0001, "loss": 4.1367, "loss/crossentropy": 2.216245174407959, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22839964926242828, "step": 22796 }, { "epoch": 0.45596, "grad_norm": 1.8671875, "grad_norm_var": 0.007798004150390625, "learning_rate": 0.0001, "loss": 3.8334, "loss/crossentropy": 1.9227718710899353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18401919305324554, "step": 22798 }, { "epoch": 0.456, "grad_norm": 2.078125, "grad_norm_var": 0.007348378499348958, "learning_rate": 0.0001, "loss": 3.6818, "loss/crossentropy": 1.916443407535553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20578119158744812, "step": 22800 }, { "epoch": 0.45604, "grad_norm": 2.046875, "grad_norm_var": 0.006947580973307292, "learning_rate": 0.0001, "loss": 3.9813, "loss/crossentropy": 1.9856197834014893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20684893429279327, "step": 22802 }, { "epoch": 0.45608, "grad_norm": 1.9140625, "grad_norm_var": 0.00740966796875, "learning_rate": 0.0001, "loss": 4.2125, "loss/crossentropy": 2.284825623035431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19741405546665192, "step": 22804 }, { "epoch": 0.45612, "grad_norm": 1.859375, "grad_norm_var": 0.00848388671875, "learning_rate": 0.0001, "loss": 3.9399, "loss/crossentropy": 2.28286874294281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18636887520551682, "step": 22806 }, { "epoch": 0.45616, "grad_norm": 1.859375, "grad_norm_var": 0.008121490478515625, "learning_rate": 0.0001, "loss": 4.0123, "loss/crossentropy": 2.1754974722862244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18408793210983276, "step": 22808 }, { "epoch": 0.4562, "grad_norm": 2.15625, "grad_norm_var": 0.011668904622395834, "learning_rate": 0.0001, "loss": 3.9584, "loss/crossentropy": 1.9059696793556213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17439302802085876, "step": 22810 }, { "epoch": 0.45624, "grad_norm": 2.015625, "grad_norm_var": 0.011075846354166667, "learning_rate": 0.0001, "loss": 4.0839, "loss/crossentropy": 2.2676509618759155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20085486769676208, "step": 22812 }, { "epoch": 0.45628, "grad_norm": 1.953125, "grad_norm_var": 0.010788726806640624, "learning_rate": 0.0001, "loss": 3.9465, "loss/crossentropy": 2.1320562958717346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19100098311901093, "step": 22814 }, { "epoch": 0.45632, "grad_norm": 2.109375, "grad_norm_var": 0.0119293212890625, "learning_rate": 0.0001, "loss": 4.2211, "loss/crossentropy": 2.1827250719070435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999506950378418, "step": 22816 }, { "epoch": 0.45636, "grad_norm": 1.921875, "grad_norm_var": 0.01083984375, "learning_rate": 0.0001, "loss": 3.7179, "loss/crossentropy": 1.861267626285553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19007404148578644, "step": 22818 }, { "epoch": 0.4564, "grad_norm": 2.1875, "grad_norm_var": 0.014749908447265625, "learning_rate": 0.0001, "loss": 4.0594, "loss/crossentropy": 2.0799012184143066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22177287936210632, "step": 22820 }, { "epoch": 0.45644, "grad_norm": 2.125, "grad_norm_var": 0.013742828369140625, "learning_rate": 0.0001, "loss": 4.2555, "loss/crossentropy": 1.847994089126587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17684108018875122, "step": 22822 }, { "epoch": 0.45648, "grad_norm": 1.90625, "grad_norm_var": 0.011685943603515625, "learning_rate": 0.0001, "loss": 4.17, "loss/crossentropy": 1.8111347556114197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1796310842037201, "step": 22824 }, { "epoch": 0.45652, "grad_norm": 1.8125, "grad_norm_var": 0.011971028645833333, "learning_rate": 0.0001, "loss": 3.7134, "loss/crossentropy": 2.038852334022522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20186404883861542, "step": 22826 }, { "epoch": 0.45656, "grad_norm": 2.078125, "grad_norm_var": 0.09888509114583334, "learning_rate": 0.0001, "loss": 3.6857, "loss/crossentropy": 1.6407727003097534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1637558490037918, "step": 22828 }, { "epoch": 0.4566, "grad_norm": 2.046875, "grad_norm_var": 0.09705174763997396, "learning_rate": 0.0001, "loss": 4.2396, "loss/crossentropy": 2.240299344062805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22101856023073196, "step": 22830 }, { "epoch": 0.45664, "grad_norm": 1.9140625, "grad_norm_var": 0.10135269165039062, "learning_rate": 0.0001, "loss": 3.744, "loss/crossentropy": 2.0920748114585876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1953132450580597, "step": 22832 }, { "epoch": 0.45668, "grad_norm": 2.015625, "grad_norm_var": 0.09823404947916667, "learning_rate": 0.0001, "loss": 4.4837, "loss/crossentropy": 2.5458264350891113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22436098754405975, "step": 22834 }, { "epoch": 0.45672, "grad_norm": 1.9375, "grad_norm_var": 0.1007232666015625, "learning_rate": 0.0001, "loss": 4.228, "loss/crossentropy": 2.2389872074127197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19549836963415146, "step": 22836 }, { "epoch": 0.45676, "grad_norm": 1.8828125, "grad_norm_var": 0.10216852823893229, "learning_rate": 0.0001, "loss": 4.0639, "loss/crossentropy": 1.943382740020752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17616666853427887, "step": 22838 }, { "epoch": 0.4568, "grad_norm": 1.7578125, "grad_norm_var": 0.10617574055989583, "learning_rate": 0.0001, "loss": 3.8807, "loss/crossentropy": 2.3038381338119507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21248196810483932, "step": 22840 }, { "epoch": 0.45684, "grad_norm": 1.7890625, "grad_norm_var": 0.10474446614583334, "learning_rate": 0.0001, "loss": 4.0004, "loss/crossentropy": 2.1889474391937256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21692103147506714, "step": 22842 }, { "epoch": 0.45688, "grad_norm": 1.8125, "grad_norm_var": 0.010959625244140625, "learning_rate": 0.0001, "loss": 3.8093, "loss/crossentropy": 1.7324257493019104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1752893403172493, "step": 22844 }, { "epoch": 0.45692, "grad_norm": 1.953125, "grad_norm_var": 0.009358469645182292, "learning_rate": 0.0001, "loss": 3.7886, "loss/crossentropy": 1.5598450899124146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1739826500415802, "step": 22846 }, { "epoch": 0.45696, "grad_norm": 1.8828125, "grad_norm_var": 0.012699381510416666, "learning_rate": 0.0001, "loss": 4.1388, "loss/crossentropy": 2.629135251045227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23805684596300125, "step": 22848 }, { "epoch": 0.457, "grad_norm": 2.03125, "grad_norm_var": 0.0129058837890625, "learning_rate": 0.0001, "loss": 4.0862, "loss/crossentropy": 2.2966033220291138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20307788252830505, "step": 22850 }, { "epoch": 0.45704, "grad_norm": 2.109375, "grad_norm_var": 0.015372467041015626, "learning_rate": 0.0001, "loss": 3.8963, "loss/crossentropy": 2.111830711364746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220965214073658, "step": 22852 }, { "epoch": 0.45708, "grad_norm": 2.078125, "grad_norm_var": 0.016657511393229168, "learning_rate": 0.0001, "loss": 3.8466, "loss/crossentropy": 1.9285584688186646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1810559406876564, "step": 22854 }, { "epoch": 0.45712, "grad_norm": 1.8671875, "grad_norm_var": 0.014798990885416667, "learning_rate": 0.0001, "loss": 3.9924, "loss/crossentropy": 2.059799551963806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19754227995872498, "step": 22856 }, { "epoch": 0.45716, "grad_norm": 1.96875, "grad_norm_var": 0.013330078125, "learning_rate": 0.0001, "loss": 4.1501, "loss/crossentropy": 2.219936490058899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20215890556573868, "step": 22858 }, { "epoch": 0.4572, "grad_norm": 2.078125, "grad_norm_var": 0.015541330973307291, "learning_rate": 0.0001, "loss": 3.6917, "loss/crossentropy": 1.639923632144928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15062974393367767, "step": 22860 }, { "epoch": 0.45724, "grad_norm": 1.9765625, "grad_norm_var": 0.013331858317057292, "learning_rate": 0.0001, "loss": 3.9225, "loss/crossentropy": 2.038732647895813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19826284795999527, "step": 22862 }, { "epoch": 0.45728, "grad_norm": 1.9296875, "grad_norm_var": 0.012969716389973959, "learning_rate": 0.0001, "loss": 3.7399, "loss/crossentropy": 1.9492093920707703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18503065407276154, "step": 22864 }, { "epoch": 0.45732, "grad_norm": 1.890625, "grad_norm_var": 0.011061350504557291, "learning_rate": 0.0001, "loss": 3.8676, "loss/crossentropy": 2.1745853424072266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20653871446847916, "step": 22866 }, { "epoch": 0.45736, "grad_norm": 1.9140625, "grad_norm_var": 0.008536529541015626, "learning_rate": 0.0001, "loss": 4.1377, "loss/crossentropy": 2.3043514490127563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20302630215883255, "step": 22868 }, { "epoch": 0.4574, "grad_norm": 1.953125, "grad_norm_var": 0.006730143229166667, "learning_rate": 0.0001, "loss": 4.0241, "loss/crossentropy": 2.032300651073456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972976177930832, "step": 22870 }, { "epoch": 0.45744, "grad_norm": 1.75, "grad_norm_var": 0.007783762613932292, "learning_rate": 0.0001, "loss": 3.9334, "loss/crossentropy": 1.7591362595558167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877521052956581, "step": 22872 }, { "epoch": 0.45748, "grad_norm": 2.109375, "grad_norm_var": 0.010179646809895833, "learning_rate": 0.0001, "loss": 3.9741, "loss/crossentropy": 1.8232863545417786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19946040213108063, "step": 22874 }, { "epoch": 0.45752, "grad_norm": 1.8515625, "grad_norm_var": 0.007627105712890625, "learning_rate": 0.0001, "loss": 4.1899, "loss/crossentropy": 2.3904630541801453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21164552122354507, "step": 22876 }, { "epoch": 0.45756, "grad_norm": 1.9375, "grad_norm_var": 0.007377115885416666, "learning_rate": 0.0001, "loss": 3.9767, "loss/crossentropy": 2.0109705328941345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19774679094552994, "step": 22878 }, { "epoch": 0.4576, "grad_norm": 1.9921875, "grad_norm_var": 0.006381988525390625, "learning_rate": 0.0001, "loss": 3.9764, "loss/crossentropy": 2.049817442893982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18809376657009125, "step": 22880 }, { "epoch": 0.45764, "grad_norm": 2.25, "grad_norm_var": 0.013444010416666667, "learning_rate": 0.0001, "loss": 3.9405, "loss/crossentropy": 1.9604635834693909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21582353115081787, "step": 22882 }, { "epoch": 0.45768, "grad_norm": 2.046875, "grad_norm_var": 0.021355946858723957, "learning_rate": 0.0001, "loss": 4.2544, "loss/crossentropy": 2.0224910378456116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21511054039001465, "step": 22884 }, { "epoch": 0.45772, "grad_norm": 2.046875, "grad_norm_var": 0.022191365559895832, "learning_rate": 0.0001, "loss": 4.1981, "loss/crossentropy": 2.1743668913841248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20319290459156036, "step": 22886 }, { "epoch": 0.45776, "grad_norm": 1.9765625, "grad_norm_var": 0.017160797119140626, "learning_rate": 0.0001, "loss": 4.0452, "loss/crossentropy": 2.0692490339279175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21043430268764496, "step": 22888 }, { "epoch": 0.4578, "grad_norm": 1.9375, "grad_norm_var": 0.01715672810872396, "learning_rate": 0.0001, "loss": 3.9975, "loss/crossentropy": 2.2976402044296265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21817487478256226, "step": 22890 }, { "epoch": 0.45784, "grad_norm": 1.828125, "grad_norm_var": 0.0191162109375, "learning_rate": 0.0001, "loss": 4.11, "loss/crossentropy": 1.9541950225830078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726100593805313, "step": 22892 }, { "epoch": 0.45788, "grad_norm": 1.7421875, "grad_norm_var": 0.022997029622395835, "learning_rate": 0.0001, "loss": 3.9245, "loss/crossentropy": 1.8908615112304688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17845968157052994, "step": 22894 }, { "epoch": 0.45792, "grad_norm": 1.8828125, "grad_norm_var": 0.023522694905598957, "learning_rate": 0.0001, "loss": 4.1408, "loss/crossentropy": 2.0752804279327393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17747191339731216, "step": 22896 }, { "epoch": 0.45796, "grad_norm": 1.9375, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 3.6761, "loss/crossentropy": 1.7240750789642334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17918948084115982, "step": 22898 }, { "epoch": 0.458, "grad_norm": 1.953125, "grad_norm_var": 0.011519114176432291, "learning_rate": 0.0001, "loss": 4.0632, "loss/crossentropy": 2.208250343799591, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937680020928383, "step": 22900 }, { "epoch": 0.45804, "grad_norm": 1.90625, "grad_norm_var": 0.008478800455729166, "learning_rate": 0.0001, "loss": 4.1283, "loss/crossentropy": 2.076810359954834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18243352323770523, "step": 22902 }, { "epoch": 0.45808, "grad_norm": 1.9453125, "grad_norm_var": 0.006624094645182292, "learning_rate": 0.0001, "loss": 4.2553, "loss/crossentropy": 1.929943025112152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19618961960077286, "step": 22904 }, { "epoch": 0.45812, "grad_norm": 1.984375, "grad_norm_var": 0.0069976806640625, "learning_rate": 0.0001, "loss": 4.1791, "loss/crossentropy": 2.186367392539978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19272585958242416, "step": 22906 }, { "epoch": 0.45816, "grad_norm": 1.890625, "grad_norm_var": 0.0063385009765625, "learning_rate": 0.0001, "loss": 4.1156, "loss/crossentropy": 2.1537517309188843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1961517035961151, "step": 22908 }, { "epoch": 0.4582, "grad_norm": 2.0625, "grad_norm_var": 0.004325103759765625, "learning_rate": 0.0001, "loss": 4.1084, "loss/crossentropy": 2.0141521096229553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909116804599762, "step": 22910 }, { "epoch": 0.45824, "grad_norm": 1.90625, "grad_norm_var": 0.004351552327473958, "learning_rate": 0.0001, "loss": 4.1536, "loss/crossentropy": 1.9720463752746582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18549032509326935, "step": 22912 }, { "epoch": 0.45828, "grad_norm": 1.90625, "grad_norm_var": 0.002243804931640625, "learning_rate": 0.0001, "loss": 4.0425, "loss/crossentropy": 1.6521074771881104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.172331303358078, "step": 22914 }, { "epoch": 0.45832, "grad_norm": 1.78125, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 3.8738, "loss/crossentropy": 2.1382288932800293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19402942061424255, "step": 22916 }, { "epoch": 0.45836, "grad_norm": 1.9453125, "grad_norm_var": 0.013688151041666667, "learning_rate": 0.0001, "loss": 4.0196, "loss/crossentropy": 2.348258137702942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21549250930547714, "step": 22918 }, { "epoch": 0.4584, "grad_norm": 2.296875, "grad_norm_var": 0.02235107421875, "learning_rate": 0.0001, "loss": 4.0723, "loss/crossentropy": 1.8907782435417175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18751342594623566, "step": 22920 }, { "epoch": 0.45844, "grad_norm": 1.953125, "grad_norm_var": 0.0222808837890625, "learning_rate": 0.0001, "loss": 4.0556, "loss/crossentropy": 2.189586043357849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954984813928604, "step": 22922 }, { "epoch": 0.45848, "grad_norm": 1.984375, "grad_norm_var": 0.021491495768229167, "learning_rate": 0.0001, "loss": 3.8171, "loss/crossentropy": 1.8710416555404663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1760418340563774, "step": 22924 }, { "epoch": 0.45852, "grad_norm": 1.90625, "grad_norm_var": 0.0213775634765625, "learning_rate": 0.0001, "loss": 4.4274, "loss/crossentropy": 2.3559741973876953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21578038483858109, "step": 22926 }, { "epoch": 0.45856, "grad_norm": 1.9609375, "grad_norm_var": 0.022739410400390625, "learning_rate": 0.0001, "loss": 3.7592, "loss/crossentropy": 1.8588098883628845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18936273455619812, "step": 22928 }, { "epoch": 0.4586, "grad_norm": 1.78125, "grad_norm_var": 0.024544270833333333, "learning_rate": 0.0001, "loss": 4.0378, "loss/crossentropy": 2.039194941520691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1917971894145012, "step": 22930 }, { "epoch": 0.45864, "grad_norm": 1.828125, "grad_norm_var": 0.016739654541015624, "learning_rate": 0.0001, "loss": 3.7675, "loss/crossentropy": 1.7586663365364075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16597917675971985, "step": 22932 }, { "epoch": 0.45868, "grad_norm": 1.9453125, "grad_norm_var": 0.01702880859375, "learning_rate": 0.0001, "loss": 3.7682, "loss/crossentropy": 2.086498737335205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21375593543052673, "step": 22934 }, { "epoch": 0.45872, "grad_norm": 2.0, "grad_norm_var": 0.007853190104166666, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 2.14085054397583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2136429250240326, "step": 22936 }, { "epoch": 0.45876, "grad_norm": 2.671875, "grad_norm_var": 0.04417724609375, "learning_rate": 0.0001, "loss": 4.064, "loss/crossentropy": 1.7813395261764526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23206777125597, "step": 22938 }, { "epoch": 0.4588, "grad_norm": 1.9140625, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 3.8535, "loss/crossentropy": 2.267697334289551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19420506060123444, "step": 22940 }, { "epoch": 0.45884, "grad_norm": 2.015625, "grad_norm_var": 0.045328776041666664, "learning_rate": 0.0001, "loss": 4.1133, "loss/crossentropy": 1.9011476039886475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20060188323259354, "step": 22942 }, { "epoch": 0.45888, "grad_norm": 1.9609375, "grad_norm_var": 0.043822987874348955, "learning_rate": 0.0001, "loss": 4.0148, "loss/crossentropy": 2.0690804719924927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20410669595003128, "step": 22944 }, { "epoch": 0.45892, "grad_norm": 2.03125, "grad_norm_var": 0.0429443359375, "learning_rate": 0.0001, "loss": 4.2914, "loss/crossentropy": 2.05231511592865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200359046459198, "step": 22946 }, { "epoch": 0.45896, "grad_norm": 1.921875, "grad_norm_var": 0.04160741170247396, "learning_rate": 0.0001, "loss": 4.2998, "loss/crossentropy": 2.167905569076538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20565193891525269, "step": 22948 }, { "epoch": 0.459, "grad_norm": 1.921875, "grad_norm_var": 0.041290028889973955, "learning_rate": 0.0001, "loss": 3.911, "loss/crossentropy": 1.8297042846679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19305560737848282, "step": 22950 }, { "epoch": 0.45904, "grad_norm": 2.03125, "grad_norm_var": 0.042335764567057295, "learning_rate": 0.0001, "loss": 4.2231, "loss/crossentropy": 2.1130523681640625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063598930835724, "step": 22952 }, { "epoch": 0.45908, "grad_norm": 2.0, "grad_norm_var": 0.011889394124348958, "learning_rate": 0.0001, "loss": 4.0074, "loss/crossentropy": 1.8652898669242859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19195117056369781, "step": 22954 }, { "epoch": 0.45912, "grad_norm": 2.078125, "grad_norm_var": 0.009948476155598959, "learning_rate": 0.0001, "loss": 4.2163, "loss/crossentropy": 2.283188223838806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21027832478284836, "step": 22956 }, { "epoch": 0.45916, "grad_norm": 2.140625, "grad_norm_var": 0.010990397135416666, "learning_rate": 0.0001, "loss": 4.1242, "loss/crossentropy": 1.9148901104927063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20047999173402786, "step": 22958 }, { "epoch": 0.4592, "grad_norm": 1.8203125, "grad_norm_var": 0.013936360677083334, "learning_rate": 0.0001, "loss": 4.1457, "loss/crossentropy": 2.0488327741622925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18601303547620773, "step": 22960 }, { "epoch": 0.45924, "grad_norm": 1.9765625, "grad_norm_var": 0.014810943603515625, "learning_rate": 0.0001, "loss": 4.0482, "loss/crossentropy": 2.316429853439331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18690580874681473, "step": 22962 }, { "epoch": 0.45928, "grad_norm": 1.8359375, "grad_norm_var": 0.013260650634765624, "learning_rate": 0.0001, "loss": 3.8958, "loss/crossentropy": 2.2469218373298645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983572617173195, "step": 22964 }, { "epoch": 0.45932, "grad_norm": 1.828125, "grad_norm_var": 0.014964803059895834, "learning_rate": 0.0001, "loss": 3.7772, "loss/crossentropy": 1.988997757434845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17721036076545715, "step": 22966 }, { "epoch": 0.45936, "grad_norm": 1.8203125, "grad_norm_var": 0.016259511311848957, "learning_rate": 0.0001, "loss": 3.6331, "loss/crossentropy": 2.0831198692321777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20350365340709686, "step": 22968 }, { "epoch": 0.4594, "grad_norm": 1.875, "grad_norm_var": 0.016599273681640624, "learning_rate": 0.0001, "loss": 4.1053, "loss/crossentropy": 1.9105250239372253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24999283254146576, "step": 22970 }, { "epoch": 0.45944, "grad_norm": 1.859375, "grad_norm_var": 0.014851633707682292, "learning_rate": 0.0001, "loss": 3.96, "loss/crossentropy": 1.990349531173706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21390174329280853, "step": 22972 }, { "epoch": 0.45948, "grad_norm": 1.8125, "grad_norm_var": 0.011759440104166666, "learning_rate": 0.0001, "loss": 3.981, "loss/crossentropy": 1.8016277551651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16551107168197632, "step": 22974 }, { "epoch": 0.45952, "grad_norm": 1.921875, "grad_norm_var": 0.008925120035807291, "learning_rate": 0.0001, "loss": 3.7665, "loss/crossentropy": 2.0964725017547607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18953562527894974, "step": 22976 }, { "epoch": 0.45956, "grad_norm": 1.90625, "grad_norm_var": 0.008455149332682292, "learning_rate": 0.0001, "loss": 4.0136, "loss/crossentropy": 1.8826870322227478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19081643223762512, "step": 22978 }, { "epoch": 0.4596, "grad_norm": 1.8515625, "grad_norm_var": 0.009175618489583334, "learning_rate": 0.0001, "loss": 4.1068, "loss/crossentropy": 2.3349568843841553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2133205682039261, "step": 22980 }, { "epoch": 0.45964, "grad_norm": 2.0625, "grad_norm_var": 0.009912109375, "learning_rate": 0.0001, "loss": 4.0551, "loss/crossentropy": 2.1611928939819336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18273824453353882, "step": 22982 }, { "epoch": 0.45968, "grad_norm": 2.0625, "grad_norm_var": 0.008969879150390625, "learning_rate": 0.0001, "loss": 4.1389, "loss/crossentropy": 1.9757606387138367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20463839918375015, "step": 22984 }, { "epoch": 0.45972, "grad_norm": 1.9609375, "grad_norm_var": 0.008320871988932292, "learning_rate": 0.0001, "loss": 3.9488, "loss/crossentropy": 2.075642466545105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19987941533327103, "step": 22986 }, { "epoch": 0.45976, "grad_norm": 2.0, "grad_norm_var": 0.009333292643229166, "learning_rate": 0.0001, "loss": 3.8588, "loss/crossentropy": 1.805062711238861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16294736042618752, "step": 22988 }, { "epoch": 0.4598, "grad_norm": 1.921875, "grad_norm_var": 0.008907063802083334, "learning_rate": 0.0001, "loss": 3.9069, "loss/crossentropy": 1.7681823372840881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16899758577346802, "step": 22990 }, { "epoch": 0.45984, "grad_norm": 2.0625, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 4.1451, "loss/crossentropy": 2.2006375789642334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2165016457438469, "step": 22992 }, { "epoch": 0.45988, "grad_norm": 1.9296875, "grad_norm_var": 0.009822591145833334, "learning_rate": 0.0001, "loss": 4.0045, "loss/crossentropy": 2.141044855117798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19580233097076416, "step": 22994 }, { "epoch": 0.45992, "grad_norm": 1.9453125, "grad_norm_var": 0.007818349202473958, "learning_rate": 0.0001, "loss": 4.1228, "loss/crossentropy": 1.8378351926803589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891382709145546, "step": 22996 }, { "epoch": 0.45996, "grad_norm": 1.9609375, "grad_norm_var": 0.00626220703125, "learning_rate": 0.0001, "loss": 4.0411, "loss/crossentropy": 1.9486631751060486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18766219913959503, "step": 22998 }, { "epoch": 0.46, "grad_norm": 2.03125, "grad_norm_var": 0.05397847493489583, "learning_rate": 0.0001, "loss": 4.0377, "loss/crossentropy": 2.2083182334899902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19954963773488998, "step": 23000 }, { "epoch": 0.46004, "grad_norm": 1.8671875, "grad_norm_var": 0.0544586181640625, "learning_rate": 0.0001, "loss": 3.8961, "loss/crossentropy": 2.163831114768982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008308321237564, "step": 23002 }, { "epoch": 0.46008, "grad_norm": 2.015625, "grad_norm_var": 0.053138987223307295, "learning_rate": 0.0001, "loss": 4.1704, "loss/crossentropy": 2.256836771965027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20936761796474457, "step": 23004 }, { "epoch": 0.46012, "grad_norm": 2.03125, "grad_norm_var": 0.051273345947265625, "learning_rate": 0.0001, "loss": 4.2633, "loss/crossentropy": 2.2618257999420166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2167700231075287, "step": 23006 }, { "epoch": 0.46016, "grad_norm": 2.734375, "grad_norm_var": 0.08671875, "learning_rate": 0.0001, "loss": 3.8584, "loss/crossentropy": 1.7830750346183777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17645323276519775, "step": 23008 }, { "epoch": 0.4602, "grad_norm": 2.046875, "grad_norm_var": 0.08814264933268229, "learning_rate": 0.0001, "loss": 4.0131, "loss/crossentropy": 2.027057111263275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19191700220108032, "step": 23010 }, { "epoch": 0.46024, "grad_norm": 1.875, "grad_norm_var": 0.09068094889322917, "learning_rate": 0.0001, "loss": 3.9818, "loss/crossentropy": 2.138069987297058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1890278086066246, "step": 23012 }, { "epoch": 0.46028, "grad_norm": 1.8359375, "grad_norm_var": 0.09110107421875, "learning_rate": 0.0001, "loss": 3.928, "loss/crossentropy": 1.8553322553634644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18419906497001648, "step": 23014 }, { "epoch": 0.46032, "grad_norm": 1.78125, "grad_norm_var": 0.049627431233723956, "learning_rate": 0.0001, "loss": 3.8015, "loss/crossentropy": 2.029839515686035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17909887433052063, "step": 23016 }, { "epoch": 0.46036, "grad_norm": 1.9140625, "grad_norm_var": 0.048868815104166664, "learning_rate": 0.0001, "loss": 3.9726, "loss/crossentropy": 2.2698041200637817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202784962952137, "step": 23018 }, { "epoch": 0.4604, "grad_norm": 2.203125, "grad_norm_var": 0.05064264933268229, "learning_rate": 0.0001, "loss": 4.2589, "loss/crossentropy": 2.1680142879486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947874128818512, "step": 23020 }, { "epoch": 0.46044, "grad_norm": 1.875, "grad_norm_var": 0.05204976399739583, "learning_rate": 0.0001, "loss": 4.0885, "loss/crossentropy": 2.1033952236175537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17692237347364426, "step": 23022 }, { "epoch": 0.46048, "grad_norm": 1.8984375, "grad_norm_var": 0.0103912353515625, "learning_rate": 0.0001, "loss": 4.2974, "loss/crossentropy": 2.3113337755203247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2220149040222168, "step": 23024 }, { "epoch": 0.46052, "grad_norm": 1.8515625, "grad_norm_var": 0.009291330973307291, "learning_rate": 0.0001, "loss": 3.76, "loss/crossentropy": 1.4061094522476196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16114307194948196, "step": 23026 }, { "epoch": 0.46056, "grad_norm": 1.7890625, "grad_norm_var": 0.010811106363932291, "learning_rate": 0.0001, "loss": 3.9877, "loss/crossentropy": 2.14698326587677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20559164136648178, "step": 23028 }, { "epoch": 0.4606, "grad_norm": 1.7734375, "grad_norm_var": 0.011787668863932291, "learning_rate": 0.0001, "loss": 3.9314, "loss/crossentropy": 2.255793571472168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18896941095590591, "step": 23030 }, { "epoch": 0.46064, "grad_norm": 1.8671875, "grad_norm_var": 0.01043701171875, "learning_rate": 0.0001, "loss": 3.9739, "loss/crossentropy": 2.186546564102173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044816091656685, "step": 23032 }, { "epoch": 0.46068, "grad_norm": 1.8984375, "grad_norm_var": 0.0108154296875, "learning_rate": 0.0001, "loss": 4.175, "loss/crossentropy": 2.285028338432312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20862022787332535, "step": 23034 }, { "epoch": 0.46072, "grad_norm": 1.9140625, "grad_norm_var": 0.0055908203125, "learning_rate": 0.0001, "loss": 4.3081, "loss/crossentropy": 2.1346707344055176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2030143365263939, "step": 23036 }, { "epoch": 0.46076, "grad_norm": 1.96875, "grad_norm_var": 0.0063168843587239586, "learning_rate": 0.0001, "loss": 4.043, "loss/crossentropy": 1.8645261526107788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1957538053393364, "step": 23038 }, { "epoch": 0.4608, "grad_norm": 1.84375, "grad_norm_var": 0.006048329671223958, "learning_rate": 0.0001, "loss": 3.8239, "loss/crossentropy": 1.9670527577400208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886385902762413, "step": 23040 }, { "epoch": 0.46084, "grad_norm": 2.03125, "grad_norm_var": 0.006644439697265625, "learning_rate": 0.0001, "loss": 3.9349, "loss/crossentropy": 2.1273213624954224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952996477484703, "step": 23042 }, { "epoch": 0.46088, "grad_norm": 1.921875, "grad_norm_var": 0.0060546875, "learning_rate": 0.0001, "loss": 3.947, "loss/crossentropy": 2.1550720930099487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20368672162294388, "step": 23044 }, { "epoch": 0.46092, "grad_norm": 1.828125, "grad_norm_var": 0.005830891927083333, "learning_rate": 0.0001, "loss": 4.054, "loss/crossentropy": 2.0871312618255615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20684774219989777, "step": 23046 }, { "epoch": 0.46096, "grad_norm": 1.859375, "grad_norm_var": 0.005980428059895833, "learning_rate": 0.0001, "loss": 3.9013, "loss/crossentropy": 2.1256298422813416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18924856930971146, "step": 23048 }, { "epoch": 0.461, "grad_norm": 2.3125, "grad_norm_var": 0.014422353108723958, "learning_rate": 0.0001, "loss": 4.3133, "loss/crossentropy": 1.9934388399124146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2362150102853775, "step": 23050 }, { "epoch": 0.46104, "grad_norm": 1.8828125, "grad_norm_var": 0.015478261311848958, "learning_rate": 0.0001, "loss": 4.0432, "loss/crossentropy": 1.9070496559143066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198696069419384, "step": 23052 }, { "epoch": 0.46108, "grad_norm": 1.890625, "grad_norm_var": 0.016403961181640624, "learning_rate": 0.0001, "loss": 3.647, "loss/crossentropy": 2.3180062770843506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19845713675022125, "step": 23054 }, { "epoch": 0.46112, "grad_norm": 2.34375, "grad_norm_var": 3.4797027587890623, "learning_rate": 0.0001, "loss": 3.9427, "loss/crossentropy": 1.5877657532691956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17085347324609756, "step": 23056 }, { "epoch": 0.46116, "grad_norm": 2.4375, "grad_norm_var": 3.4481992085774738, "learning_rate": 0.0001, "loss": 4.1123, "loss/crossentropy": 1.9232125282287598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18383993953466415, "step": 23058 }, { "epoch": 0.4612, "grad_norm": 1.9765625, "grad_norm_var": 3.442277018229167, "learning_rate": 0.0001, "loss": 4.0018, "loss/crossentropy": 1.7219146490097046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18319600075483322, "step": 23060 }, { "epoch": 0.46124, "grad_norm": 2.0625, "grad_norm_var": 3.425248209635417, "learning_rate": 0.0001, "loss": 4.0123, "loss/crossentropy": 2.274755835533142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20030079036951065, "step": 23062 }, { "epoch": 0.46128, "grad_norm": 2.015625, "grad_norm_var": 3.420116170247396, "learning_rate": 0.0001, "loss": 3.9746, "loss/crossentropy": 2.1552868485450745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19475380331277847, "step": 23064 }, { "epoch": 0.46132, "grad_norm": 1.9140625, "grad_norm_var": 3.4470842997233073, "learning_rate": 0.0001, "loss": 3.7861, "loss/crossentropy": 1.7763307094573975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16649360209703445, "step": 23066 }, { "epoch": 0.46136, "grad_norm": 2.0, "grad_norm_var": 3.4487945556640627, "learning_rate": 0.0001, "loss": 4.1083, "loss/crossentropy": 1.859923779964447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17723160982131958, "step": 23068 }, { "epoch": 0.4614, "grad_norm": 1.9453125, "grad_norm_var": 3.4543690999348957, "learning_rate": 0.0001, "loss": 3.9948, "loss/crossentropy": 1.9055342078208923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18139074742794037, "step": 23070 }, { "epoch": 0.46144, "grad_norm": 1.8203125, "grad_norm_var": 0.02443415323893229, "learning_rate": 0.0001, "loss": 4.1256, "loss/crossentropy": 2.2504982948303223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21642805635929108, "step": 23072 }, { "epoch": 0.46148, "grad_norm": 1.8671875, "grad_norm_var": 0.007469685872395834, "learning_rate": 0.0001, "loss": 4.2309, "loss/crossentropy": 2.142970085144043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19985180348157883, "step": 23074 }, { "epoch": 0.46152, "grad_norm": 1.8203125, "grad_norm_var": 0.0076253255208333336, "learning_rate": 0.0001, "loss": 3.836, "loss/crossentropy": 2.0464539527893066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20277106016874313, "step": 23076 }, { "epoch": 0.46156, "grad_norm": 2.03125, "grad_norm_var": 0.007741038004557292, "learning_rate": 0.0001, "loss": 3.9323, "loss/crossentropy": 1.9818042516708374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055400162935257, "step": 23078 }, { "epoch": 0.4616, "grad_norm": 1.984375, "grad_norm_var": 0.0063555399576822914, "learning_rate": 0.0001, "loss": 3.9854, "loss/crossentropy": 2.245171070098877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21404191851615906, "step": 23080 }, { "epoch": 0.46164, "grad_norm": 2.0, "grad_norm_var": 0.009336090087890625, "learning_rate": 0.0001, "loss": 3.8036, "loss/crossentropy": 1.8196159601211548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19405725598335266, "step": 23082 }, { "epoch": 0.46168, "grad_norm": 1.9296875, "grad_norm_var": 0.0571197509765625, "learning_rate": 0.0001, "loss": 4.4681, "loss/crossentropy": 2.2700140476226807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2196115106344223, "step": 23084 }, { "epoch": 0.46172, "grad_norm": 1.859375, "grad_norm_var": 0.05880940755208333, "learning_rate": 0.0001, "loss": 3.7722, "loss/crossentropy": 1.8162717819213867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19593419879674911, "step": 23086 }, { "epoch": 0.46176, "grad_norm": 1.9375, "grad_norm_var": 0.057920074462890624, "learning_rate": 0.0001, "loss": 3.8489, "loss/crossentropy": 2.287248969078064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079424038529396, "step": 23088 }, { "epoch": 0.4618, "grad_norm": 1.953125, "grad_norm_var": 0.05727310180664062, "learning_rate": 0.0001, "loss": 4.1084, "loss/crossentropy": 2.004701316356659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19984427094459534, "step": 23090 }, { "epoch": 0.46184, "grad_norm": 1.953125, "grad_norm_var": 0.05663655598958333, "learning_rate": 0.0001, "loss": 4.3167, "loss/crossentropy": 2.185683250427246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210247203707695, "step": 23092 }, { "epoch": 0.46188, "grad_norm": 1.8359375, "grad_norm_var": 0.055425771077473956, "learning_rate": 0.0001, "loss": 4.0199, "loss/crossentropy": 2.050000488758087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19505897909402847, "step": 23094 }, { "epoch": 0.46192, "grad_norm": 1.953125, "grad_norm_var": 0.056703440348307294, "learning_rate": 0.0001, "loss": 3.9138, "loss/crossentropy": 2.0964688062667847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19070538878440857, "step": 23096 }, { "epoch": 0.46196, "grad_norm": 1.953125, "grad_norm_var": 0.05205256144205729, "learning_rate": 0.0001, "loss": 4.1354, "loss/crossentropy": 2.02596253156662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22631053626537323, "step": 23098 }, { "epoch": 0.462, "grad_norm": 1.9296875, "grad_norm_var": 0.004951985677083334, "learning_rate": 0.0001, "loss": 4.0223, "loss/crossentropy": 1.8759450912475586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18205147981643677, "step": 23100 }, { "epoch": 0.46204, "grad_norm": 1.9140625, "grad_norm_var": 0.004231516520182292, "learning_rate": 0.0001, "loss": 3.8365, "loss/crossentropy": 1.9460791945457458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17607779800891876, "step": 23102 }, { "epoch": 0.46208, "grad_norm": 2.265625, "grad_norm_var": 0.011031087239583333, "learning_rate": 0.0001, "loss": 3.9159, "loss/crossentropy": 1.5419431328773499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15973693877458572, "step": 23104 }, { "epoch": 0.46212, "grad_norm": 2.171875, "grad_norm_var": 0.014019521077473958, "learning_rate": 0.0001, "loss": 4.2898, "loss/crossentropy": 2.13347589969635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21076981723308563, "step": 23106 }, { "epoch": 0.46216, "grad_norm": 1.8515625, "grad_norm_var": 0.013948567708333333, "learning_rate": 0.0001, "loss": 4.2812, "loss/crossentropy": 2.199908971786499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1933242455124855, "step": 23108 }, { "epoch": 0.4622, "grad_norm": 1.890625, "grad_norm_var": 0.013425445556640625, "learning_rate": 0.0001, "loss": 3.8662, "loss/crossentropy": 2.0715879797935486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1933552771806717, "step": 23110 }, { "epoch": 0.46224, "grad_norm": 2.0, "grad_norm_var": 0.0129302978515625, "learning_rate": 0.0001, "loss": 3.9125, "loss/crossentropy": 1.8359448909759521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17814192920923233, "step": 23112 }, { "epoch": 0.46228, "grad_norm": 2.140625, "grad_norm_var": 0.014987945556640625, "learning_rate": 0.0001, "loss": 4.1782, "loss/crossentropy": 1.9806716442108154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18513696640729904, "step": 23114 }, { "epoch": 0.46232, "grad_norm": 1.9375, "grad_norm_var": 0.014725494384765624, "learning_rate": 0.0001, "loss": 3.9427, "loss/crossentropy": 1.647410809993744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.177697092294693, "step": 23116 }, { "epoch": 0.46236, "grad_norm": 1.734375, "grad_norm_var": 0.01885986328125, "learning_rate": 0.0001, "loss": 3.7102, "loss/crossentropy": 2.05259370803833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18241985887289047, "step": 23118 }, { "epoch": 0.4624, "grad_norm": 1.7890625, "grad_norm_var": 0.013874308268229166, "learning_rate": 0.0001, "loss": 3.9863, "loss/crossentropy": 2.229410469532013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21042390167713165, "step": 23120 }, { "epoch": 0.46244, "grad_norm": 1.875, "grad_norm_var": 0.010163370768229167, "learning_rate": 0.0001, "loss": 3.7858, "loss/crossentropy": 1.6477417945861816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15472649037837982, "step": 23122 }, { "epoch": 0.46248, "grad_norm": 1.9765625, "grad_norm_var": 0.009089914957682292, "learning_rate": 0.0001, "loss": 4.1175, "loss/crossentropy": 2.0088080167770386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18309780955314636, "step": 23124 }, { "epoch": 0.46252, "grad_norm": 1.8828125, "grad_norm_var": 0.009129842122395834, "learning_rate": 0.0001, "loss": 3.8097, "loss/crossentropy": 1.8413895964622498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828669160604477, "step": 23126 }, { "epoch": 0.46256, "grad_norm": 1.8125, "grad_norm_var": 0.009565989176432291, "learning_rate": 0.0001, "loss": 3.8763, "loss/crossentropy": 2.0812554955482483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836593672633171, "step": 23128 }, { "epoch": 0.4626, "grad_norm": 1.9375, "grad_norm_var": 0.006224568684895833, "learning_rate": 0.0001, "loss": 4.1576, "loss/crossentropy": 2.269462764263153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22045638412237167, "step": 23130 }, { "epoch": 0.46264, "grad_norm": 2.09375, "grad_norm_var": 0.008898671468098958, "learning_rate": 0.0001, "loss": 4.203, "loss/crossentropy": 2.1176013946533203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19339244812726974, "step": 23132 }, { "epoch": 0.46268, "grad_norm": 2.078125, "grad_norm_var": 0.008983357747395834, "learning_rate": 0.0001, "loss": 3.889, "loss/crossentropy": 1.8236181735992432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18049708008766174, "step": 23134 }, { "epoch": 0.46272, "grad_norm": 1.90625, "grad_norm_var": 0.007608795166015625, "learning_rate": 0.0001, "loss": 4.163, "loss/crossentropy": 1.7525643706321716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17853398621082306, "step": 23136 }, { "epoch": 0.46276, "grad_norm": 1.9765625, "grad_norm_var": 0.006135050455729167, "learning_rate": 0.0001, "loss": 4.3239, "loss/crossentropy": 2.2958520650863647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21427057683467865, "step": 23138 }, { "epoch": 0.4628, "grad_norm": 1.9296875, "grad_norm_var": 0.009016927083333333, "learning_rate": 0.0001, "loss": 3.6256, "loss/crossentropy": 1.8358682990074158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18813011050224304, "step": 23140 }, { "epoch": 0.46284, "grad_norm": 2.09375, "grad_norm_var": 0.010667928059895833, "learning_rate": 0.0001, "loss": 4.3444, "loss/crossentropy": 2.4191941022872925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2317579984664917, "step": 23142 }, { "epoch": 0.46288, "grad_norm": 1.9921875, "grad_norm_var": 0.012452952067057292, "learning_rate": 0.0001, "loss": 3.9152, "loss/crossentropy": 2.2656983137130737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20870012044906616, "step": 23144 }, { "epoch": 0.46292, "grad_norm": 1.9921875, "grad_norm_var": 0.014347076416015625, "learning_rate": 0.0001, "loss": 4.0893, "loss/crossentropy": 2.2222647666931152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072836235165596, "step": 23146 }, { "epoch": 0.46296, "grad_norm": 2.015625, "grad_norm_var": 0.012410481770833334, "learning_rate": 0.0001, "loss": 4.0469, "loss/crossentropy": 2.001325011253357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20486044138669968, "step": 23148 }, { "epoch": 0.463, "grad_norm": 1.84375, "grad_norm_var": 0.013061269124348959, "learning_rate": 0.0001, "loss": 4.0521, "loss/crossentropy": 2.3565926551818848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19294436275959015, "step": 23150 }, { "epoch": 0.46304, "grad_norm": 1.9140625, "grad_norm_var": 0.0134033203125, "learning_rate": 0.0001, "loss": 3.9513, "loss/crossentropy": 1.9006416201591492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20511633157730103, "step": 23152 }, { "epoch": 0.46308, "grad_norm": 1.8203125, "grad_norm_var": 0.015697224934895834, "learning_rate": 0.0001, "loss": 3.761, "loss/crossentropy": 2.017968237400055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877719983458519, "step": 23154 }, { "epoch": 0.46312, "grad_norm": 1.8828125, "grad_norm_var": 0.012833658854166667, "learning_rate": 0.0001, "loss": 4.0591, "loss/crossentropy": 2.093901038169861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1863752081990242, "step": 23156 }, { "epoch": 0.46316, "grad_norm": 2.015625, "grad_norm_var": 0.011690266927083333, "learning_rate": 0.0001, "loss": 4.1145, "loss/crossentropy": 2.3631211519241333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2350725531578064, "step": 23158 }, { "epoch": 0.4632, "grad_norm": 1.9453125, "grad_norm_var": 0.007892862955729166, "learning_rate": 0.0001, "loss": 4.1794, "loss/crossentropy": 2.2482752799987793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18792778253555298, "step": 23160 }, { "epoch": 0.46324, "grad_norm": 1.90625, "grad_norm_var": 0.004115549723307291, "learning_rate": 0.0001, "loss": 4.0843, "loss/crossentropy": 2.462800145149231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21113361418247223, "step": 23162 }, { "epoch": 0.46328, "grad_norm": 1.9140625, "grad_norm_var": 0.003639475504557292, "learning_rate": 0.0001, "loss": 4.1277, "loss/crossentropy": 2.2522822618484497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163240611553192, "step": 23164 }, { "epoch": 0.46332, "grad_norm": 2.3125, "grad_norm_var": 0.013444010416666667, "learning_rate": 0.0001, "loss": 3.9615, "loss/crossentropy": 2.0585132837295532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20750011503696442, "step": 23166 }, { "epoch": 0.46336, "grad_norm": 2.09375, "grad_norm_var": 0.018357086181640624, "learning_rate": 0.0001, "loss": 4.0761, "loss/crossentropy": 2.0086065530776978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21273234486579895, "step": 23168 }, { "epoch": 0.4634, "grad_norm": 2.125, "grad_norm_var": 0.017040761311848958, "learning_rate": 0.0001, "loss": 3.7175, "loss/crossentropy": 1.8222439289093018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188787043094635, "step": 23170 }, { "epoch": 0.46344, "grad_norm": 2.046875, "grad_norm_var": 0.015843709309895832, "learning_rate": 0.0001, "loss": 4.0286, "loss/crossentropy": 1.9914604425430298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19186841696500778, "step": 23172 }, { "epoch": 0.46348, "grad_norm": 1.9609375, "grad_norm_var": 0.01594823201497396, "learning_rate": 0.0001, "loss": 4.022, "loss/crossentropy": 1.5708445310592651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16981858015060425, "step": 23174 }, { "epoch": 0.46352, "grad_norm": 1.953125, "grad_norm_var": 0.015900675455729166, "learning_rate": 0.0001, "loss": 3.981, "loss/crossentropy": 2.170333504676819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20143602043390274, "step": 23176 }, { "epoch": 0.46356, "grad_norm": 1.8515625, "grad_norm_var": 0.016033681233723958, "learning_rate": 0.0001, "loss": 4.0689, "loss/crossentropy": 1.8889789581298828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17962591350078583, "step": 23178 }, { "epoch": 0.4636, "grad_norm": 1.9296875, "grad_norm_var": 0.016917928059895834, "learning_rate": 0.0001, "loss": 3.877, "loss/crossentropy": 1.9203835129737854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20256394147872925, "step": 23180 }, { "epoch": 0.46364, "grad_norm": 1.9609375, "grad_norm_var": 0.009715779622395834, "learning_rate": 0.0001, "loss": 4.1076, "loss/crossentropy": 2.2325422763824463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23236138373613358, "step": 23182 }, { "epoch": 0.46368, "grad_norm": 1.8515625, "grad_norm_var": 0.010133616129557292, "learning_rate": 0.0001, "loss": 3.6904, "loss/crossentropy": 1.5864351987838745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15730705857276917, "step": 23184 }, { "epoch": 0.46372, "grad_norm": 2.0625, "grad_norm_var": 0.0185211181640625, "learning_rate": 0.0001, "loss": 4.4229, "loss/crossentropy": 2.5526922941207886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22684484720230103, "step": 23186 }, { "epoch": 0.46376, "grad_norm": 1.84375, "grad_norm_var": 0.01954930623372396, "learning_rate": 0.0001, "loss": 3.7298, "loss/crossentropy": 1.6752784252166748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16830473393201828, "step": 23188 }, { "epoch": 0.4638, "grad_norm": 1.8671875, "grad_norm_var": 0.019877115885416668, "learning_rate": 0.0001, "loss": 4.0933, "loss/crossentropy": 1.992654800415039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198849655687809, "step": 23190 }, { "epoch": 0.46384, "grad_norm": 2.03125, "grad_norm_var": 0.021541341145833334, "learning_rate": 0.0001, "loss": 4.1588, "loss/crossentropy": 1.932292878627777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19009747356176376, "step": 23192 }, { "epoch": 0.46388, "grad_norm": 2.046875, "grad_norm_var": 0.02211278279622396, "learning_rate": 0.0001, "loss": 4.2248, "loss/crossentropy": 2.362654209136963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087061107158661, "step": 23194 }, { "epoch": 0.46392, "grad_norm": 1.8828125, "grad_norm_var": 0.02215754191080729, "learning_rate": 0.0001, "loss": 3.932, "loss/crossentropy": 2.117887258529663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19435501843690872, "step": 23196 }, { "epoch": 0.46396, "grad_norm": 1.859375, "grad_norm_var": 0.0228912353515625, "learning_rate": 0.0001, "loss": 3.7937, "loss/crossentropy": 2.164400637149811, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19678650051355362, "step": 23198 }, { "epoch": 0.464, "grad_norm": 2.734375, "grad_norm_var": 0.05366795857747396, "learning_rate": 0.0001, "loss": 4.1249, "loss/crossentropy": 2.1662251949310303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19994695484638214, "step": 23200 }, { "epoch": 0.46404, "grad_norm": 2.046875, "grad_norm_var": 0.04720433553059896, "learning_rate": 0.0001, "loss": 4.246, "loss/crossentropy": 2.082236111164093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19977488368749619, "step": 23202 }, { "epoch": 0.46408, "grad_norm": 1.8515625, "grad_norm_var": 0.04742838541666667, "learning_rate": 0.0001, "loss": 4.0368, "loss/crossentropy": 2.12766033411026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18198081851005554, "step": 23204 }, { "epoch": 0.46412, "grad_norm": 1.90625, "grad_norm_var": 0.047907511393229164, "learning_rate": 0.0001, "loss": 3.7709, "loss/crossentropy": 2.0322113633155823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1872313991189003, "step": 23206 }, { "epoch": 0.46416, "grad_norm": 1.8515625, "grad_norm_var": 0.04896214803059896, "learning_rate": 0.0001, "loss": 4.0456, "loss/crossentropy": 2.339892268180847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19740620255470276, "step": 23208 }, { "epoch": 0.4642, "grad_norm": 1.9609375, "grad_norm_var": 0.04987691243489583, "learning_rate": 0.0001, "loss": 3.7924, "loss/crossentropy": 1.989054560661316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2098187357187271, "step": 23210 }, { "epoch": 0.46424, "grad_norm": 1.90625, "grad_norm_var": 0.050065104166666666, "learning_rate": 0.0001, "loss": 3.8527, "loss/crossentropy": 2.0670509934425354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19233432412147522, "step": 23212 }, { "epoch": 0.46428, "grad_norm": 2.078125, "grad_norm_var": 0.04978815714518229, "learning_rate": 0.0001, "loss": 4.055, "loss/crossentropy": 2.136350452899933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21299083530902863, "step": 23214 }, { "epoch": 0.46432, "grad_norm": 1.9375, "grad_norm_var": 0.008333079020182292, "learning_rate": 0.0001, "loss": 4.2758, "loss/crossentropy": 2.1144350171089172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19434542953968048, "step": 23216 }, { "epoch": 0.46436, "grad_norm": 1.7578125, "grad_norm_var": 0.008605702718098959, "learning_rate": 0.0001, "loss": 3.8981, "loss/crossentropy": 2.307699203491211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19784536957740784, "step": 23218 }, { "epoch": 0.4644, "grad_norm": 1.8203125, "grad_norm_var": 0.00841064453125, "learning_rate": 0.0001, "loss": 3.7529, "loss/crossentropy": 1.8954175114631653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18727902323007584, "step": 23220 }, { "epoch": 0.46444, "grad_norm": 2.140625, "grad_norm_var": 0.011207834879557291, "learning_rate": 0.0001, "loss": 4.2463, "loss/crossentropy": 2.3020440340042114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126014232635498, "step": 23222 }, { "epoch": 0.46448, "grad_norm": 2.015625, "grad_norm_var": 0.010628255208333333, "learning_rate": 0.0001, "loss": 4.1424, "loss/crossentropy": 2.1159419417381287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19804084300994873, "step": 23224 }, { "epoch": 0.46452, "grad_norm": 1.859375, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 3.7152, "loss/crossentropy": 1.7309923768043518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19001224637031555, "step": 23226 }, { "epoch": 0.46456, "grad_norm": 1.9609375, "grad_norm_var": 0.009273274739583334, "learning_rate": 0.0001, "loss": 4.0433, "loss/crossentropy": 2.2450767755508423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20024073868989944, "step": 23228 }, { "epoch": 0.4646, "grad_norm": 1.96875, "grad_norm_var": 0.0081695556640625, "learning_rate": 0.0001, "loss": 4.1005, "loss/crossentropy": 1.8837137818336487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20254365354776382, "step": 23230 }, { "epoch": 0.46464, "grad_norm": 1.9140625, "grad_norm_var": 0.010825347900390626, "learning_rate": 0.0001, "loss": 3.9611, "loss/crossentropy": 1.773855447769165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17011478543281555, "step": 23232 }, { "epoch": 0.46468, "grad_norm": 2.34375, "grad_norm_var": 0.017479451497395833, "learning_rate": 0.0001, "loss": 4.1329, "loss/crossentropy": 2.2123981714248657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134469673037529, "step": 23234 }, { "epoch": 0.46472, "grad_norm": 1.9140625, "grad_norm_var": 0.015449778238932291, "learning_rate": 0.0001, "loss": 3.8216, "loss/crossentropy": 2.004405915737152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19117921590805054, "step": 23236 }, { "epoch": 0.46476, "grad_norm": 1.8671875, "grad_norm_var": 0.01505126953125, "learning_rate": 0.0001, "loss": 4.012, "loss/crossentropy": 2.2839083671569824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20231828093528748, "step": 23238 }, { "epoch": 0.4648, "grad_norm": 2.0, "grad_norm_var": 0.014769490559895833, "learning_rate": 0.0001, "loss": 3.9148, "loss/crossentropy": 2.196630358695984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992904543876648, "step": 23240 }, { "epoch": 0.46484, "grad_norm": 1.9375, "grad_norm_var": 0.013981119791666666, "learning_rate": 0.0001, "loss": 4.0372, "loss/crossentropy": 1.907668113708496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16895314306020737, "step": 23242 }, { "epoch": 0.46488, "grad_norm": 1.8984375, "grad_norm_var": 0.01436767578125, "learning_rate": 0.0001, "loss": 4.2587, "loss/crossentropy": 2.1902048587799072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20193833857774734, "step": 23244 }, { "epoch": 0.46492, "grad_norm": 2.015625, "grad_norm_var": 0.014427693684895833, "learning_rate": 0.0001, "loss": 4.084, "loss/crossentropy": 2.088558316230774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21183707565069199, "step": 23246 }, { "epoch": 0.46496, "grad_norm": 2.265625, "grad_norm_var": 0.018390909830729166, "learning_rate": 0.0001, "loss": 4.0602, "loss/crossentropy": 2.3056023120880127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21808549761772156, "step": 23248 }, { "epoch": 0.465, "grad_norm": 1.9140625, "grad_norm_var": 0.009235636393229166, "learning_rate": 0.0001, "loss": 4.2253, "loss/crossentropy": 2.255687952041626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19484181702136993, "step": 23250 }, { "epoch": 0.46504, "grad_norm": 1.9296875, "grad_norm_var": 0.009406534830729167, "learning_rate": 0.0001, "loss": 4.0991, "loss/crossentropy": 2.042704999446869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20347727835178375, "step": 23252 }, { "epoch": 0.46508, "grad_norm": 1.828125, "grad_norm_var": 0.009901682535807291, "learning_rate": 0.0001, "loss": 4.1159, "loss/crossentropy": 2.0365681648254395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19337813556194305, "step": 23254 }, { "epoch": 0.46512, "grad_norm": 1.8515625, "grad_norm_var": 0.010707346598307292, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 2.1936534643173218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163967713713646, "step": 23256 }, { "epoch": 0.46516, "grad_norm": 1.8671875, "grad_norm_var": 0.01199951171875, "learning_rate": 0.0001, "loss": 3.9913, "loss/crossentropy": 2.116168260574341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20038608461618423, "step": 23258 }, { "epoch": 0.4652, "grad_norm": 1.859375, "grad_norm_var": 0.012654622395833334, "learning_rate": 0.0001, "loss": 4.0231, "loss/crossentropy": 2.182048201560974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19984129816293716, "step": 23260 }, { "epoch": 0.46524, "grad_norm": 2.171875, "grad_norm_var": 0.018418121337890624, "learning_rate": 0.0001, "loss": 3.953, "loss/crossentropy": 2.1267359256744385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19032182544469833, "step": 23262 }, { "epoch": 0.46528, "grad_norm": 1.8125, "grad_norm_var": 0.0109527587890625, "learning_rate": 0.0001, "loss": 3.8849, "loss/crossentropy": 1.763411819934845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1661328673362732, "step": 23264 }, { "epoch": 0.46532, "grad_norm": 2.015625, "grad_norm_var": 0.011279042561848958, "learning_rate": 0.0001, "loss": 4.1665, "loss/crossentropy": 1.908778965473175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274333655834198, "step": 23266 }, { "epoch": 0.46536, "grad_norm": 1.890625, "grad_norm_var": 0.010994466145833333, "learning_rate": 0.0001, "loss": 4.0938, "loss/crossentropy": 2.2249021530151367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18010813742876053, "step": 23268 }, { "epoch": 0.4654, "grad_norm": 2.03125, "grad_norm_var": 0.013032786051432292, "learning_rate": 0.0001, "loss": 4.2546, "loss/crossentropy": 2.4366761445999146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22475503385066986, "step": 23270 }, { "epoch": 0.46544, "grad_norm": 1.8984375, "grad_norm_var": 0.013084920247395833, "learning_rate": 0.0001, "loss": 4.2797, "loss/crossentropy": 2.2445744276046753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18397177010774612, "step": 23272 }, { "epoch": 0.46548, "grad_norm": 2.109375, "grad_norm_var": 0.0141021728515625, "learning_rate": 0.0001, "loss": 4.5025, "loss/crossentropy": 2.1187288761138916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064574956893921, "step": 23274 }, { "epoch": 0.46552, "grad_norm": 2.03125, "grad_norm_var": 0.0139892578125, "learning_rate": 0.0001, "loss": 4.0211, "loss/crossentropy": 1.9558793902397156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1832912713289261, "step": 23276 }, { "epoch": 0.46556, "grad_norm": 2.140625, "grad_norm_var": 0.009187571207682292, "learning_rate": 0.0001, "loss": 4.2, "loss/crossentropy": 2.125900387763977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23090820014476776, "step": 23278 }, { "epoch": 0.4656, "grad_norm": 1.7890625, "grad_norm_var": 0.009860992431640625, "learning_rate": 0.0001, "loss": 4.146, "loss/crossentropy": 2.337615966796875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21365046501159668, "step": 23280 }, { "epoch": 0.46564, "grad_norm": 1.8671875, "grad_norm_var": 0.011482747395833333, "learning_rate": 0.0001, "loss": 3.8832, "loss/crossentropy": 2.0574655532836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987999752163887, "step": 23282 }, { "epoch": 0.46568, "grad_norm": 1.890625, "grad_norm_var": 0.011250813802083334, "learning_rate": 0.0001, "loss": 3.8743, "loss/crossentropy": 1.8969943523406982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18804477155208588, "step": 23284 }, { "epoch": 0.46572, "grad_norm": 1.984375, "grad_norm_var": 0.010005442301432292, "learning_rate": 0.0001, "loss": 4.2682, "loss/crossentropy": 2.218023180961609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202347069978714, "step": 23286 }, { "epoch": 0.46576, "grad_norm": 2.0, "grad_norm_var": 0.010019683837890625, "learning_rate": 0.0001, "loss": 4.0489, "loss/crossentropy": 2.2892422676086426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21481822431087494, "step": 23288 }, { "epoch": 0.4658, "grad_norm": 1.96875, "grad_norm_var": 0.0074859619140625, "learning_rate": 0.0001, "loss": 4.1356, "loss/crossentropy": 1.9342178106307983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18514516204595566, "step": 23290 }, { "epoch": 0.46584, "grad_norm": 1.8046875, "grad_norm_var": 0.0075762430826822914, "learning_rate": 0.0001, "loss": 3.9764, "loss/crossentropy": 2.134042203426361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20004402846097946, "step": 23292 }, { "epoch": 0.46588, "grad_norm": 1.9609375, "grad_norm_var": 0.006422678629557292, "learning_rate": 0.0001, "loss": 4.3574, "loss/crossentropy": 2.075138568878174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18759854137897491, "step": 23294 }, { "epoch": 0.46592, "grad_norm": 1.9140625, "grad_norm_var": 0.005260976155598959, "learning_rate": 0.0001, "loss": 4.0339, "loss/crossentropy": 2.0511878728866577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19307290762662888, "step": 23296 }, { "epoch": 0.46596, "grad_norm": 1.9453125, "grad_norm_var": 0.004484049479166667, "learning_rate": 0.0001, "loss": 4.31, "loss/crossentropy": 2.2446112632751465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207113802433014, "step": 23298 }, { "epoch": 0.466, "grad_norm": 1.875, "grad_norm_var": 0.0057769775390625, "learning_rate": 0.0001, "loss": 4.1328, "loss/crossentropy": 2.0350372195243835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17964959889650345, "step": 23300 }, { "epoch": 0.46604, "grad_norm": 2.234375, "grad_norm_var": 0.011844635009765625, "learning_rate": 0.0001, "loss": 4.4, "loss/crossentropy": 2.3412392139434814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23192551732063293, "step": 23302 }, { "epoch": 0.46608, "grad_norm": 1.859375, "grad_norm_var": 0.013456217447916667, "learning_rate": 0.0001, "loss": 4.1086, "loss/crossentropy": 2.1255269050598145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22011695802211761, "step": 23304 }, { "epoch": 0.46612, "grad_norm": 1.8984375, "grad_norm_var": 0.013818105061848959, "learning_rate": 0.0001, "loss": 4.1335, "loss/crossentropy": 2.2372154593467712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20802992582321167, "step": 23306 }, { "epoch": 0.46616, "grad_norm": 1.8515625, "grad_norm_var": 0.013051096598307292, "learning_rate": 0.0001, "loss": 3.9148, "loss/crossentropy": 1.766309678554535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1731080263853073, "step": 23308 }, { "epoch": 0.4662, "grad_norm": 2.109375, "grad_norm_var": 0.013354237874348958, "learning_rate": 0.0001, "loss": 4.2245, "loss/crossentropy": 2.035153806209564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21127209067344666, "step": 23310 }, { "epoch": 0.46624, "grad_norm": 1.859375, "grad_norm_var": 0.014525349934895833, "learning_rate": 0.0001, "loss": 3.8472, "loss/crossentropy": 2.161786377429962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059953212738037, "step": 23312 }, { "epoch": 0.46628, "grad_norm": 1.859375, "grad_norm_var": 0.015195465087890625, "learning_rate": 0.0001, "loss": 4.0489, "loss/crossentropy": 2.2908066511154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20963146537542343, "step": 23314 }, { "epoch": 0.46632, "grad_norm": 1.859375, "grad_norm_var": 0.015885162353515624, "learning_rate": 0.0001, "loss": 3.9638, "loss/crossentropy": 2.1423317193984985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051083892583847, "step": 23316 }, { "epoch": 0.46636, "grad_norm": 1.9140625, "grad_norm_var": 0.009303538004557292, "learning_rate": 0.0001, "loss": 3.9683, "loss/crossentropy": 1.953084647655487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18355616927146912, "step": 23318 }, { "epoch": 0.4664, "grad_norm": 2.078125, "grad_norm_var": 0.008348592122395833, "learning_rate": 0.0001, "loss": 4.4509, "loss/crossentropy": 2.3963130712509155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21937239170074463, "step": 23320 }, { "epoch": 0.46644, "grad_norm": 1.828125, "grad_norm_var": 0.009419759114583334, "learning_rate": 0.0001, "loss": 3.8771, "loss/crossentropy": 1.991708517074585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18753455579280853, "step": 23322 }, { "epoch": 0.46648, "grad_norm": 2.125, "grad_norm_var": 0.013142903645833334, "learning_rate": 0.0001, "loss": 4.4012, "loss/crossentropy": 2.1105082035064697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21250775456428528, "step": 23324 }, { "epoch": 0.46652, "grad_norm": 2.03125, "grad_norm_var": 0.011885579427083333, "learning_rate": 0.0001, "loss": 4.2301, "loss/crossentropy": 2.3817760944366455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22308632731437683, "step": 23326 }, { "epoch": 0.46656, "grad_norm": 2.0, "grad_norm_var": 0.010762532552083334, "learning_rate": 0.0001, "loss": 3.9596, "loss/crossentropy": 1.9623343348503113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189848855137825, "step": 23328 }, { "epoch": 0.4666, "grad_norm": 1.9765625, "grad_norm_var": 0.011156209309895833, "learning_rate": 0.0001, "loss": 4.0722, "loss/crossentropy": 2.2103809118270874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21060046553611755, "step": 23330 }, { "epoch": 0.46664, "grad_norm": 2.046875, "grad_norm_var": 0.0101715087890625, "learning_rate": 0.0001, "loss": 3.915, "loss/crossentropy": 2.0628533959388733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19960719347000122, "step": 23332 }, { "epoch": 0.46668, "grad_norm": 1.90625, "grad_norm_var": 0.01181640625, "learning_rate": 0.0001, "loss": 3.9823, "loss/crossentropy": 1.8410035371780396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17858797311782837, "step": 23334 }, { "epoch": 0.46672, "grad_norm": 1.9921875, "grad_norm_var": 0.010786946614583333, "learning_rate": 0.0001, "loss": 4.0117, "loss/crossentropy": 2.056548833847046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881648674607277, "step": 23336 }, { "epoch": 0.46676, "grad_norm": 1.78125, "grad_norm_var": 0.012962849934895833, "learning_rate": 0.0001, "loss": 3.8496, "loss/crossentropy": 1.8516274094581604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19123712182044983, "step": 23338 }, { "epoch": 0.4668, "grad_norm": 1.9609375, "grad_norm_var": 0.009137980143229167, "learning_rate": 0.0001, "loss": 4.1999, "loss/crossentropy": 2.1613941192626953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937795951962471, "step": 23340 }, { "epoch": 0.46684, "grad_norm": 2.0, "grad_norm_var": 0.010106404622395834, "learning_rate": 0.0001, "loss": 4.052, "loss/crossentropy": 1.9474772810935974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17672593146562576, "step": 23342 }, { "epoch": 0.46688, "grad_norm": 1.7734375, "grad_norm_var": 0.01136474609375, "learning_rate": 0.0001, "loss": 3.9052, "loss/crossentropy": 1.8008830547332764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1621333733201027, "step": 23344 }, { "epoch": 0.46692, "grad_norm": 2.140625, "grad_norm_var": 0.013768513997395834, "learning_rate": 0.0001, "loss": 4.385, "loss/crossentropy": 2.505362868309021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21897228807210922, "step": 23346 }, { "epoch": 0.46696, "grad_norm": 1.8671875, "grad_norm_var": 0.013702138264973959, "learning_rate": 0.0001, "loss": 3.9262, "loss/crossentropy": 1.7621804475784302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18244817852973938, "step": 23348 }, { "epoch": 0.467, "grad_norm": 1.8828125, "grad_norm_var": 0.012630208333333334, "learning_rate": 0.0001, "loss": 4.0277, "loss/crossentropy": 1.8438073992729187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19016395509243011, "step": 23350 }, { "epoch": 0.46704, "grad_norm": 1.953125, "grad_norm_var": 0.012630208333333334, "learning_rate": 0.0001, "loss": 3.7829, "loss/crossentropy": 1.9118491411209106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19326411187648773, "step": 23352 }, { "epoch": 0.46708, "grad_norm": 1.9765625, "grad_norm_var": 0.009806315104166666, "learning_rate": 0.0001, "loss": 3.9878, "loss/crossentropy": 1.911489188671112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17859935760498047, "step": 23354 }, { "epoch": 0.46712, "grad_norm": 1.9921875, "grad_norm_var": 0.014235178629557291, "learning_rate": 0.0001, "loss": 4.1033, "loss/crossentropy": 2.107007384300232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20303775370121002, "step": 23356 }, { "epoch": 0.46716, "grad_norm": 1.671875, "grad_norm_var": 0.017659505208333332, "learning_rate": 0.0001, "loss": 3.7504, "loss/crossentropy": 2.242035746574402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20925073325634003, "step": 23358 }, { "epoch": 0.4672, "grad_norm": 1.796875, "grad_norm_var": 0.016364542643229167, "learning_rate": 0.0001, "loss": 3.9543, "loss/crossentropy": 1.9015487432479858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20830972492694855, "step": 23360 }, { "epoch": 0.46724, "grad_norm": 1.8984375, "grad_norm_var": 0.013916015625, "learning_rate": 0.0001, "loss": 3.8229, "loss/crossentropy": 1.8648836612701416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17992950975894928, "step": 23362 }, { "epoch": 0.46728, "grad_norm": 1.96875, "grad_norm_var": 0.012984212239583333, "learning_rate": 0.0001, "loss": 4.026, "loss/crossentropy": 1.7606803178787231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17884241789579391, "step": 23364 }, { "epoch": 0.46732, "grad_norm": 2.171875, "grad_norm_var": 0.016635894775390625, "learning_rate": 0.0001, "loss": 3.9579, "loss/crossentropy": 2.115469813346863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2030588760972023, "step": 23366 }, { "epoch": 0.46736, "grad_norm": 1.9375, "grad_norm_var": 0.017814127604166667, "learning_rate": 0.0001, "loss": 3.9706, "loss/crossentropy": 1.9341481924057007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17974573373794556, "step": 23368 }, { "epoch": 0.4674, "grad_norm": 2.03125, "grad_norm_var": 0.017317454020182293, "learning_rate": 0.0001, "loss": 4.0494, "loss/crossentropy": 2.171218752861023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20550543069839478, "step": 23370 }, { "epoch": 0.46744, "grad_norm": 2.015625, "grad_norm_var": 0.013392893473307292, "learning_rate": 0.0001, "loss": 4.0207, "loss/crossentropy": 2.0552384853363037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19348255544900894, "step": 23372 }, { "epoch": 0.46748, "grad_norm": 1.8203125, "grad_norm_var": 0.010643513997395833, "learning_rate": 0.0001, "loss": 3.6074, "loss/crossentropy": 1.7817274332046509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16803114861249924, "step": 23374 }, { "epoch": 0.46752, "grad_norm": 2.015625, "grad_norm_var": 0.010245513916015626, "learning_rate": 0.0001, "loss": 4.2458, "loss/crossentropy": 2.0412665009498596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20734214782714844, "step": 23376 }, { "epoch": 0.46756, "grad_norm": 1.9609375, "grad_norm_var": 0.012520090738932291, "learning_rate": 0.0001, "loss": 4.0561, "loss/crossentropy": 2.0934815406799316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062682881951332, "step": 23378 }, { "epoch": 0.4676, "grad_norm": 2.140625, "grad_norm_var": 0.015720367431640625, "learning_rate": 0.0001, "loss": 3.9964, "loss/crossentropy": 2.076676905155182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20668631047010422, "step": 23380 }, { "epoch": 0.46764, "grad_norm": 2.0625, "grad_norm_var": 0.0129302978515625, "learning_rate": 0.0001, "loss": 4.2509, "loss/crossentropy": 2.185506582260132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21427924185991287, "step": 23382 }, { "epoch": 0.46768, "grad_norm": 1.984375, "grad_norm_var": 0.011771392822265626, "learning_rate": 0.0001, "loss": 3.865, "loss/crossentropy": 2.0467058420181274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19394882023334503, "step": 23384 }, { "epoch": 0.46772, "grad_norm": 2.140625, "grad_norm_var": 0.013304646809895833, "learning_rate": 0.0001, "loss": 4.2221, "loss/crossentropy": 2.4391517639160156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22828804701566696, "step": 23386 }, { "epoch": 0.46776, "grad_norm": 2.03125, "grad_norm_var": 0.0128814697265625, "learning_rate": 0.0001, "loss": 4.1389, "loss/crossentropy": 2.2490856647491455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21296796947717667, "step": 23388 }, { "epoch": 0.4678, "grad_norm": 1.9765625, "grad_norm_var": 0.011810048421223959, "learning_rate": 0.0001, "loss": 3.7515, "loss/crossentropy": 1.5810586214065552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17569316178560257, "step": 23390 }, { "epoch": 0.46784, "grad_norm": 1.953125, "grad_norm_var": 0.011805979410807292, "learning_rate": 0.0001, "loss": 3.9116, "loss/crossentropy": 2.069303274154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18829242885112762, "step": 23392 }, { "epoch": 0.46788, "grad_norm": 2.09375, "grad_norm_var": 0.011004384358723958, "learning_rate": 0.0001, "loss": 4.1711, "loss/crossentropy": 1.9410834312438965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1875254362821579, "step": 23394 }, { "epoch": 0.46792, "grad_norm": 1.7421875, "grad_norm_var": 0.011498769124348959, "learning_rate": 0.0001, "loss": 3.697, "loss/crossentropy": 2.1052953004837036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19472668319940567, "step": 23396 }, { "epoch": 0.46796, "grad_norm": 1.875, "grad_norm_var": 0.011250813802083334, "learning_rate": 0.0001, "loss": 4.0819, "loss/crossentropy": 2.1837843656539917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20570768415927887, "step": 23398 }, { "epoch": 0.468, "grad_norm": 1.71875, "grad_norm_var": 0.014229075113932291, "learning_rate": 0.0001, "loss": 3.7475, "loss/crossentropy": 1.980469524860382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19470104575157166, "step": 23400 }, { "epoch": 0.46804, "grad_norm": 1.890625, "grad_norm_var": 0.011472320556640625, "learning_rate": 0.0001, "loss": 3.9931, "loss/crossentropy": 1.992879033088684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19029226154088974, "step": 23402 }, { "epoch": 0.46808, "grad_norm": 1.953125, "grad_norm_var": 0.010689036051432291, "learning_rate": 0.0001, "loss": 4.085, "loss/crossentropy": 2.5265753269195557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20238864421844482, "step": 23404 }, { "epoch": 0.46812, "grad_norm": 1.9296875, "grad_norm_var": 0.008943430582682292, "learning_rate": 0.0001, "loss": 3.9541, "loss/crossentropy": 2.1182003021240234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22061780095100403, "step": 23406 }, { "epoch": 0.46816, "grad_norm": 1.90625, "grad_norm_var": 0.008381144205729166, "learning_rate": 0.0001, "loss": 3.9054, "loss/crossentropy": 1.8726989030838013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18806783854961395, "step": 23408 }, { "epoch": 0.4682, "grad_norm": 2.46875, "grad_norm_var": 0.026371256510416666, "learning_rate": 0.0001, "loss": 4.0764, "loss/crossentropy": 2.3275599479675293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21061742305755615, "step": 23410 }, { "epoch": 0.46824, "grad_norm": 2.140625, "grad_norm_var": 0.02622044881184896, "learning_rate": 0.0001, "loss": 3.8391, "loss/crossentropy": 1.9331459999084473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18973465263843536, "step": 23412 }, { "epoch": 0.46828, "grad_norm": 1.765625, "grad_norm_var": 0.028043365478515624, "learning_rate": 0.0001, "loss": 3.7524, "loss/crossentropy": 1.9546124339103699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17560599744319916, "step": 23414 }, { "epoch": 0.46832, "grad_norm": 2.21875, "grad_norm_var": 0.03088353474934896, "learning_rate": 0.0001, "loss": 4.2547, "loss/crossentropy": 2.5048669576644897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21813815087080002, "step": 23416 }, { "epoch": 0.46836, "grad_norm": 1.9765625, "grad_norm_var": 0.029930623372395833, "learning_rate": 0.0001, "loss": 4.0534, "loss/crossentropy": 1.8067769408226013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18293478339910507, "step": 23418 }, { "epoch": 0.4684, "grad_norm": 1.828125, "grad_norm_var": 0.031981404622395834, "learning_rate": 0.0001, "loss": 3.7527, "loss/crossentropy": 1.6894381642341614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17101695388555527, "step": 23420 }, { "epoch": 0.46844, "grad_norm": 1.9609375, "grad_norm_var": 0.03349177042643229, "learning_rate": 0.0001, "loss": 3.6362, "loss/crossentropy": 2.3269423246383667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20582617074251175, "step": 23422 }, { "epoch": 0.46848, "grad_norm": 1.9921875, "grad_norm_var": 0.03337376912434896, "learning_rate": 0.0001, "loss": 3.9875, "loss/crossentropy": 1.9424707293510437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18869344890117645, "step": 23424 }, { "epoch": 0.46852, "grad_norm": 1.9296875, "grad_norm_var": 0.01929499308268229, "learning_rate": 0.0001, "loss": 3.927, "loss/crossentropy": 1.8979859948158264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17844338715076447, "step": 23426 }, { "epoch": 0.46856, "grad_norm": 1.9140625, "grad_norm_var": 0.0167236328125, "learning_rate": 0.0001, "loss": 4.3855, "loss/crossentropy": 2.2464778423309326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18301833420991898, "step": 23428 }, { "epoch": 0.4686, "grad_norm": 1.7109375, "grad_norm_var": 0.02848078409830729, "learning_rate": 0.0001, "loss": 3.8957, "loss/crossentropy": 1.9471614360809326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20326492190361023, "step": 23430 }, { "epoch": 0.46864, "grad_norm": 1.9609375, "grad_norm_var": 0.02283299763997396, "learning_rate": 0.0001, "loss": 4.0045, "loss/crossentropy": 2.04762601852417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19779526442289352, "step": 23432 }, { "epoch": 0.46868, "grad_norm": 1.8984375, "grad_norm_var": 0.022607167561848957, "learning_rate": 0.0001, "loss": 4.1019, "loss/crossentropy": 2.1162944436073303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19175396859645844, "step": 23434 }, { "epoch": 0.46872, "grad_norm": 2.21875, "grad_norm_var": 0.025567372639973957, "learning_rate": 0.0001, "loss": 3.9651, "loss/crossentropy": 1.6130013465881348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16156645864248276, "step": 23436 }, { "epoch": 0.46876, "grad_norm": 2.046875, "grad_norm_var": 0.023423004150390624, "learning_rate": 0.0001, "loss": 3.9582, "loss/crossentropy": 2.0393518805503845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20742176473140717, "step": 23438 }, { "epoch": 0.4688, "grad_norm": 1.90625, "grad_norm_var": 0.023811848958333333, "learning_rate": 0.0001, "loss": 3.7042, "loss/crossentropy": 1.6487592458724976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16911844164133072, "step": 23440 }, { "epoch": 0.46884, "grad_norm": 1.7890625, "grad_norm_var": 0.023034413655598957, "learning_rate": 0.0001, "loss": 3.8824, "loss/crossentropy": 1.8171139359474182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18819218128919601, "step": 23442 }, { "epoch": 0.46888, "grad_norm": 1.921875, "grad_norm_var": 0.022981770833333335, "learning_rate": 0.0001, "loss": 4.0849, "loss/crossentropy": 1.669542133808136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18253905326128006, "step": 23444 }, { "epoch": 0.46892, "grad_norm": 1.96875, "grad_norm_var": 0.008845011393229166, "learning_rate": 0.0001, "loss": 3.9415, "loss/crossentropy": 1.8465211391448975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.194318987429142, "step": 23446 }, { "epoch": 0.46896, "grad_norm": 2.0625, "grad_norm_var": 0.011742146809895833, "learning_rate": 0.0001, "loss": 4.0606, "loss/crossentropy": 2.4016542434692383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081214338541031, "step": 23448 }, { "epoch": 0.469, "grad_norm": 1.7890625, "grad_norm_var": 0.013692220052083334, "learning_rate": 0.0001, "loss": 4.062, "loss/crossentropy": 2.2425581216812134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19481270760297775, "step": 23450 }, { "epoch": 0.46904, "grad_norm": 1.9765625, "grad_norm_var": 0.009590657552083333, "learning_rate": 0.0001, "loss": 3.9642, "loss/crossentropy": 1.8975054621696472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20339980721473694, "step": 23452 }, { "epoch": 0.46908, "grad_norm": 2.328125, "grad_norm_var": 0.01871312459309896, "learning_rate": 0.0001, "loss": 4.1964, "loss/crossentropy": 2.274298667907715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22013512253761292, "step": 23454 }, { "epoch": 0.46912, "grad_norm": 1.9609375, "grad_norm_var": 0.019724273681640626, "learning_rate": 0.0001, "loss": 3.7991, "loss/crossentropy": 2.286523461341858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21875084191560745, "step": 23456 }, { "epoch": 0.46916, "grad_norm": 1.953125, "grad_norm_var": 0.017465972900390626, "learning_rate": 0.0001, "loss": 4.1091, "loss/crossentropy": 2.199851155281067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20098759233951569, "step": 23458 }, { "epoch": 0.4692, "grad_norm": 2.3125, "grad_norm_var": 0.025780232747395833, "learning_rate": 0.0001, "loss": 3.76, "loss/crossentropy": 1.896471917629242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18724198639392853, "step": 23460 }, { "epoch": 0.46924, "grad_norm": 1.8671875, "grad_norm_var": 0.026879628499348957, "learning_rate": 0.0001, "loss": 4.0191, "loss/crossentropy": 2.0206944942474365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1795445680618286, "step": 23462 }, { "epoch": 0.46928, "grad_norm": 1.8046875, "grad_norm_var": 0.025911458333333335, "learning_rate": 0.0001, "loss": 4.1516, "loss/crossentropy": 2.230253279209137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20442913472652435, "step": 23464 }, { "epoch": 0.46932, "grad_norm": 1.96875, "grad_norm_var": 0.025055948893229166, "learning_rate": 0.0001, "loss": 4.0306, "loss/crossentropy": 2.117598056793213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20652078092098236, "step": 23466 }, { "epoch": 0.46936, "grad_norm": 1.953125, "grad_norm_var": 0.0245758056640625, "learning_rate": 0.0001, "loss": 4.0638, "loss/crossentropy": 1.6677707433700562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15297261625528336, "step": 23468 }, { "epoch": 0.4694, "grad_norm": 1.8984375, "grad_norm_var": 0.014875284830729167, "learning_rate": 0.0001, "loss": 4.0632, "loss/crossentropy": 2.039306938648224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21778366714715958, "step": 23470 }, { "epoch": 0.46944, "grad_norm": 1.9375, "grad_norm_var": 0.013923136393229167, "learning_rate": 0.0001, "loss": 3.9302, "loss/crossentropy": 1.8116675019264221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19272665679454803, "step": 23472 }, { "epoch": 0.46948, "grad_norm": 2.015625, "grad_norm_var": 0.015433502197265626, "learning_rate": 0.0001, "loss": 4.0605, "loss/crossentropy": 2.378050446510315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21410302817821503, "step": 23474 }, { "epoch": 0.46952, "grad_norm": 2.015625, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 2.2755852937698364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20388127863407135, "step": 23476 }, { "epoch": 0.46956, "grad_norm": 1.9296875, "grad_norm_var": 0.005692545572916667, "learning_rate": 0.0001, "loss": 4.0909, "loss/crossentropy": 2.1453245282173157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20442672073841095, "step": 23478 }, { "epoch": 0.4696, "grad_norm": 1.9375, "grad_norm_var": 0.003543853759765625, "learning_rate": 0.0001, "loss": 3.7373, "loss/crossentropy": 2.0084391236305237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18768452107906342, "step": 23480 }, { "epoch": 0.46964, "grad_norm": 1.96875, "grad_norm_var": 0.002872467041015625, "learning_rate": 0.0001, "loss": 3.9004, "loss/crossentropy": 1.9836713075637817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1842040792107582, "step": 23482 }, { "epoch": 0.46968, "grad_norm": 2.015625, "grad_norm_var": 0.004923502604166667, "learning_rate": 0.0001, "loss": 3.8047, "loss/crossentropy": 1.8174407482147217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1779571697115898, "step": 23484 }, { "epoch": 0.46972, "grad_norm": 2.203125, "grad_norm_var": 0.010835774739583333, "learning_rate": 0.0001, "loss": 3.9017, "loss/crossentropy": 1.8971520066261292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19008546322584152, "step": 23486 }, { "epoch": 0.46976, "grad_norm": 2.0, "grad_norm_var": 0.011356608072916666, "learning_rate": 0.0001, "loss": 3.7841, "loss/crossentropy": 1.9949323534965515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878834068775177, "step": 23488 }, { "epoch": 0.4698, "grad_norm": 1.9453125, "grad_norm_var": 0.010212198893229166, "learning_rate": 0.0001, "loss": 4.0221, "loss/crossentropy": 1.7825579047203064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1911696270108223, "step": 23490 }, { "epoch": 0.46984, "grad_norm": 1.9609375, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 4.0892, "loss/crossentropy": 2.305335283279419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20759187638759613, "step": 23492 }, { "epoch": 0.46988, "grad_norm": 2.078125, "grad_norm_var": 0.2226715087890625, "learning_rate": 0.0001, "loss": 3.894, "loss/crossentropy": 2.0296601057052612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1965966895222664, "step": 23494 }, { "epoch": 0.46992, "grad_norm": 1.921875, "grad_norm_var": 0.22094319661458334, "learning_rate": 0.0001, "loss": 4.2948, "loss/crossentropy": 2.5238767862319946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21435904502868652, "step": 23496 }, { "epoch": 0.46996, "grad_norm": 1.9765625, "grad_norm_var": 0.21852213541666668, "learning_rate": 0.0001, "loss": 4.1532, "loss/crossentropy": 2.2085973024368286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983572095632553, "step": 23498 }, { "epoch": 0.47, "grad_norm": 1.921875, "grad_norm_var": 0.21304423014322918, "learning_rate": 0.0001, "loss": 4.2002, "loss/crossentropy": 2.257534384727478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23111815005540848, "step": 23500 }, { "epoch": 0.47004, "grad_norm": 1.8984375, "grad_norm_var": 0.21055094401041666, "learning_rate": 0.0001, "loss": 4.2338, "loss/crossentropy": 2.3113789558410645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043687105178833, "step": 23502 }, { "epoch": 0.47008, "grad_norm": 1.8359375, "grad_norm_var": 0.2143267313639323, "learning_rate": 0.0001, "loss": 4.0727, "loss/crossentropy": 2.2028011083602905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18799518048763275, "step": 23504 }, { "epoch": 0.47012, "grad_norm": 2.03125, "grad_norm_var": 0.21318257649739583, "learning_rate": 0.0001, "loss": 4.3289, "loss/crossentropy": 2.1186457872390747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2014438509941101, "step": 23506 }, { "epoch": 0.47016, "grad_norm": 1.9921875, "grad_norm_var": 0.21224136352539064, "learning_rate": 0.0001, "loss": 3.9821, "loss/crossentropy": 2.0399728417396545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18888195604085922, "step": 23508 }, { "epoch": 0.4702, "grad_norm": 2.03125, "grad_norm_var": 0.011350250244140625, "learning_rate": 0.0001, "loss": 3.9898, "loss/crossentropy": 1.943885326385498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20364457368850708, "step": 23510 }, { "epoch": 0.47024, "grad_norm": 2.171875, "grad_norm_var": 0.011812082926432292, "learning_rate": 0.0001, "loss": 4.0646, "loss/crossentropy": 1.9351946115493774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18621300905942917, "step": 23512 }, { "epoch": 0.47028, "grad_norm": 1.8125, "grad_norm_var": 0.014351399739583333, "learning_rate": 0.0001, "loss": 3.873, "loss/crossentropy": 2.0112447142601013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1884174421429634, "step": 23514 }, { "epoch": 0.47032, "grad_norm": 1.9453125, "grad_norm_var": 0.014227040608723958, "learning_rate": 0.0001, "loss": 3.9918, "loss/crossentropy": 2.164724826812744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21057604998350143, "step": 23516 }, { "epoch": 0.47036, "grad_norm": 1.9453125, "grad_norm_var": 0.0138092041015625, "learning_rate": 0.0001, "loss": 4.2016, "loss/crossentropy": 2.198709011077881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20399244129657745, "step": 23518 }, { "epoch": 0.4704, "grad_norm": 1.8828125, "grad_norm_var": 0.014792633056640626, "learning_rate": 0.0001, "loss": 3.8945, "loss/crossentropy": 2.3152371644973755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19689901918172836, "step": 23520 }, { "epoch": 0.47044, "grad_norm": 1.90625, "grad_norm_var": 0.015242258707682291, "learning_rate": 0.0001, "loss": 3.9598, "loss/crossentropy": 1.8530511260032654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19973145425319672, "step": 23522 }, { "epoch": 0.47048, "grad_norm": 1.890625, "grad_norm_var": 0.010894521077473959, "learning_rate": 0.0001, "loss": 3.9725, "loss/crossentropy": 1.6506844758987427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17004889249801636, "step": 23524 }, { "epoch": 0.47052, "grad_norm": 1.9765625, "grad_norm_var": 0.00750732421875, "learning_rate": 0.0001, "loss": 4.0036, "loss/crossentropy": 1.7448241710662842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20921830832958221, "step": 23526 }, { "epoch": 0.47056, "grad_norm": 1.90625, "grad_norm_var": 0.0036944071451822918, "learning_rate": 0.0001, "loss": 3.7118, "loss/crossentropy": 1.7023364305496216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18442079424858093, "step": 23528 }, { "epoch": 0.4706, "grad_norm": 2.09375, "grad_norm_var": 0.005909983317057292, "learning_rate": 0.0001, "loss": 3.9572, "loss/crossentropy": 2.1476879119873047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.205117367208004, "step": 23530 }, { "epoch": 0.47064, "grad_norm": 1.8671875, "grad_norm_var": 0.006154123942057292, "learning_rate": 0.0001, "loss": 4.1135, "loss/crossentropy": 1.9776412844657898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900978982448578, "step": 23532 }, { "epoch": 0.47068, "grad_norm": 1.8515625, "grad_norm_var": 0.009437815348307291, "learning_rate": 0.0001, "loss": 4.2694, "loss/crossentropy": 2.6421544551849365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21158932149410248, "step": 23534 }, { "epoch": 0.47072, "grad_norm": 2.375, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 4.4728, "loss/crossentropy": 2.3335670232772827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25099293887615204, "step": 23536 }, { "epoch": 0.47076, "grad_norm": 2.015625, "grad_norm_var": 0.02069091796875, "learning_rate": 0.0001, "loss": 4.2101, "loss/crossentropy": 2.309471607208252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20084770023822784, "step": 23538 }, { "epoch": 0.4708, "grad_norm": 1.96875, "grad_norm_var": 0.020799763997395835, "learning_rate": 0.0001, "loss": 3.6632, "loss/crossentropy": 1.794599175453186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17749344557523727, "step": 23540 }, { "epoch": 0.47084, "grad_norm": 2.109375, "grad_norm_var": 0.022440338134765626, "learning_rate": 0.0001, "loss": 4.3814, "loss/crossentropy": 2.0325594544410706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19963493198156357, "step": 23542 }, { "epoch": 0.47088, "grad_norm": 2.015625, "grad_norm_var": 0.018863677978515625, "learning_rate": 0.0001, "loss": 4.0274, "loss/crossentropy": 2.0868377089500427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20961833745241165, "step": 23544 }, { "epoch": 0.47092, "grad_norm": 1.7734375, "grad_norm_var": 0.02393366495768229, "learning_rate": 0.0001, "loss": 3.7205, "loss/crossentropy": 1.9807188510894775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892232447862625, "step": 23546 }, { "epoch": 0.47096, "grad_norm": 1.8828125, "grad_norm_var": 0.023872884114583333, "learning_rate": 0.0001, "loss": 4.2712, "loss/crossentropy": 2.292481303215027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22137918323278427, "step": 23548 }, { "epoch": 0.471, "grad_norm": 1.90625, "grad_norm_var": 0.022240193684895833, "learning_rate": 0.0001, "loss": 4.0832, "loss/crossentropy": 2.240318775177002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084667906165123, "step": 23550 }, { "epoch": 0.47104, "grad_norm": 1.7890625, "grad_norm_var": 0.019969685872395834, "learning_rate": 0.0001, "loss": 3.8889, "loss/crossentropy": 2.0380293130874634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2129639983177185, "step": 23552 }, { "epoch": 0.47108, "grad_norm": 1.8828125, "grad_norm_var": 0.01995849609375, "learning_rate": 0.0001, "loss": 3.9957, "loss/crossentropy": 2.296157479286194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214280404150486, "step": 23554 }, { "epoch": 0.47112, "grad_norm": 2.1875, "grad_norm_var": 0.022655995686848958, "learning_rate": 0.0001, "loss": 4.0575, "loss/crossentropy": 2.308936357498169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20904332399368286, "step": 23556 }, { "epoch": 0.47116, "grad_norm": 1.9609375, "grad_norm_var": 0.01942723592122396, "learning_rate": 0.0001, "loss": 3.8463, "loss/crossentropy": 1.935391128063202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19526035338640213, "step": 23558 }, { "epoch": 0.4712, "grad_norm": 1.8828125, "grad_norm_var": 0.019022369384765626, "learning_rate": 0.0001, "loss": 4.0567, "loss/crossentropy": 2.21670663356781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21146119385957718, "step": 23560 }, { "epoch": 0.47124, "grad_norm": 1.9296875, "grad_norm_var": 0.01649958292643229, "learning_rate": 0.0001, "loss": 3.808, "loss/crossentropy": 1.9948694705963135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18240909278392792, "step": 23562 }, { "epoch": 0.47128, "grad_norm": 1.984375, "grad_norm_var": 0.0165283203125, "learning_rate": 0.0001, "loss": 4.0053, "loss/crossentropy": 2.0435580015182495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20460917055606842, "step": 23564 }, { "epoch": 0.47132, "grad_norm": 1.828125, "grad_norm_var": 0.018381500244140626, "learning_rate": 0.0001, "loss": 3.4991, "loss/crossentropy": 2.0021498799324036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17788298428058624, "step": 23566 }, { "epoch": 0.47136, "grad_norm": 2.015625, "grad_norm_var": 0.008845774332682292, "learning_rate": 0.0001, "loss": 3.9934, "loss/crossentropy": 2.0119062066078186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1800990328192711, "step": 23568 }, { "epoch": 0.4714, "grad_norm": 1.8671875, "grad_norm_var": 0.008967081705729166, "learning_rate": 0.0001, "loss": 3.8167, "loss/crossentropy": 1.5521536469459534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15504960715770721, "step": 23570 }, { "epoch": 0.47144, "grad_norm": 2.015625, "grad_norm_var": 0.004400380452473958, "learning_rate": 0.0001, "loss": 3.954, "loss/crossentropy": 1.9227730631828308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20652085542678833, "step": 23572 }, { "epoch": 0.47148, "grad_norm": 1.9765625, "grad_norm_var": 0.006078084309895833, "learning_rate": 0.0001, "loss": 4.075, "loss/crossentropy": 2.0273211002349854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18889201432466507, "step": 23574 }, { "epoch": 0.47152, "grad_norm": 2.03125, "grad_norm_var": 0.006788889567057292, "learning_rate": 0.0001, "loss": 3.8103, "loss/crossentropy": 1.934963881969452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17783871293067932, "step": 23576 }, { "epoch": 0.47156, "grad_norm": 1.9921875, "grad_norm_var": 0.007020823160807292, "learning_rate": 0.0001, "loss": 3.8893, "loss/crossentropy": 1.9819161295890808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19646496325731277, "step": 23578 }, { "epoch": 0.4716, "grad_norm": 2.21875, "grad_norm_var": 0.014387003580729167, "learning_rate": 0.0001, "loss": 4.1545, "loss/crossentropy": 1.9830252528190613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19861895591020584, "step": 23580 }, { "epoch": 0.47164, "grad_norm": 2.125, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 4.1332, "loss/crossentropy": 1.9679479598999023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19719085842370987, "step": 23582 }, { "epoch": 0.47168, "grad_norm": 1.859375, "grad_norm_var": 0.0126708984375, "learning_rate": 0.0001, "loss": 3.9055, "loss/crossentropy": 1.8915583491325378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777575984597206, "step": 23584 }, { "epoch": 0.47172, "grad_norm": 1.9375, "grad_norm_var": 0.011533355712890625, "learning_rate": 0.0001, "loss": 4.0302, "loss/crossentropy": 2.126678943634033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21122275292873383, "step": 23586 }, { "epoch": 0.47176, "grad_norm": 1.859375, "grad_norm_var": 0.012109375, "learning_rate": 0.0001, "loss": 3.8424, "loss/crossentropy": 1.8739299774169922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19388508796691895, "step": 23588 }, { "epoch": 0.4718, "grad_norm": 1.8984375, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 4.057, "loss/crossentropy": 1.983488917350769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18411333858966827, "step": 23590 }, { "epoch": 0.47184, "grad_norm": 2.03125, "grad_norm_var": 0.016796875, "learning_rate": 0.0001, "loss": 4.4929, "loss/crossentropy": 2.127811551094055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1860193982720375, "step": 23592 }, { "epoch": 0.47188, "grad_norm": 2.3125, "grad_norm_var": 0.022696940104166667, "learning_rate": 0.0001, "loss": 4.0036, "loss/crossentropy": 1.8028001189231873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18699438124895096, "step": 23594 }, { "epoch": 0.47192, "grad_norm": 1.9609375, "grad_norm_var": 0.02093683878580729, "learning_rate": 0.0001, "loss": 4.0754, "loss/crossentropy": 2.0716105699539185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1935330480337143, "step": 23596 }, { "epoch": 0.47196, "grad_norm": 2.0, "grad_norm_var": 0.020845540364583335, "learning_rate": 0.0001, "loss": 3.8819, "loss/crossentropy": 2.0923121571540833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20516054332256317, "step": 23598 }, { "epoch": 0.472, "grad_norm": 1.8359375, "grad_norm_var": 0.020357259114583335, "learning_rate": 0.0001, "loss": 4.0871, "loss/crossentropy": 2.244894862174988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21077310293912888, "step": 23600 }, { "epoch": 0.47204, "grad_norm": 1.8828125, "grad_norm_var": 0.02054417928059896, "learning_rate": 0.0001, "loss": 3.9788, "loss/crossentropy": 1.9484725594520569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16783934831619263, "step": 23602 }, { "epoch": 0.47208, "grad_norm": 2.203125, "grad_norm_var": 0.19378026326497397, "learning_rate": 0.0001, "loss": 3.8165, "loss/crossentropy": 1.8027321100234985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21114902943372726, "step": 23604 }, { "epoch": 0.47212, "grad_norm": 2.015625, "grad_norm_var": 0.18945083618164063, "learning_rate": 0.0001, "loss": 3.8104, "loss/crossentropy": 1.7768787741661072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848941519856453, "step": 23606 }, { "epoch": 0.47216, "grad_norm": 2.125, "grad_norm_var": 0.19102274576822917, "learning_rate": 0.0001, "loss": 3.8457, "loss/crossentropy": 1.8786412477493286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18932975828647614, "step": 23608 }, { "epoch": 0.4722, "grad_norm": 1.828125, "grad_norm_var": 0.1960113525390625, "learning_rate": 0.0001, "loss": 3.9519, "loss/crossentropy": 2.0566998720169067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1795952171087265, "step": 23610 }, { "epoch": 0.47224, "grad_norm": 2.03125, "grad_norm_var": 0.19468154907226562, "learning_rate": 0.0001, "loss": 4.0255, "loss/crossentropy": 1.9085100293159485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19264912605285645, "step": 23612 }, { "epoch": 0.47228, "grad_norm": 1.8984375, "grad_norm_var": 0.19574381510416666, "learning_rate": 0.0001, "loss": 4.0038, "loss/crossentropy": 2.0626463294029236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947372406721115, "step": 23614 }, { "epoch": 0.47232, "grad_norm": 2.09375, "grad_norm_var": 0.1929278055826823, "learning_rate": 0.0001, "loss": 4.144, "loss/crossentropy": 2.0861737728118896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21039434522390366, "step": 23616 }, { "epoch": 0.47236, "grad_norm": 2.046875, "grad_norm_var": 0.19161783854166667, "learning_rate": 0.0001, "loss": 3.9677, "loss/crossentropy": 1.9024608135223389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18196185678243637, "step": 23618 }, { "epoch": 0.4724, "grad_norm": 1.9453125, "grad_norm_var": 0.006371815999348958, "learning_rate": 0.0001, "loss": 4.1103, "loss/crossentropy": 2.209709107875824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21473529189825058, "step": 23620 }, { "epoch": 0.47244, "grad_norm": 1.921875, "grad_norm_var": 0.0069010416666666664, "learning_rate": 0.0001, "loss": 3.9269, "loss/crossentropy": 2.1699984073638916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023090422153473, "step": 23622 }, { "epoch": 0.47248, "grad_norm": 1.875, "grad_norm_var": 0.006441243489583333, "learning_rate": 0.0001, "loss": 3.7869, "loss/crossentropy": 1.7027064561843872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16561312228441238, "step": 23624 }, { "epoch": 0.47252, "grad_norm": 2.109375, "grad_norm_var": 0.008526357014973958, "learning_rate": 0.0001, "loss": 3.9753, "loss/crossentropy": 2.2230480909347534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20173655450344086, "step": 23626 }, { "epoch": 0.47256, "grad_norm": 1.8203125, "grad_norm_var": 0.008841705322265626, "learning_rate": 0.0001, "loss": 3.6368, "loss/crossentropy": 1.9015594124794006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1708713322877884, "step": 23628 }, { "epoch": 0.4726, "grad_norm": 2.09375, "grad_norm_var": 0.010723622639973958, "learning_rate": 0.0001, "loss": 4.0118, "loss/crossentropy": 2.051177144050598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878596916794777, "step": 23630 }, { "epoch": 0.47264, "grad_norm": 1.8515625, "grad_norm_var": 0.013752237955729166, "learning_rate": 0.0001, "loss": 3.9746, "loss/crossentropy": 1.9841215014457703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1841672658920288, "step": 23632 }, { "epoch": 0.47268, "grad_norm": 2.0625, "grad_norm_var": 0.0148590087890625, "learning_rate": 0.0001, "loss": 4.1022, "loss/crossentropy": 2.0642913579940796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22168366611003876, "step": 23634 }, { "epoch": 0.47272, "grad_norm": 1.9765625, "grad_norm_var": 0.014899698893229167, "learning_rate": 0.0001, "loss": 4.0448, "loss/crossentropy": 2.149936318397522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954735815525055, "step": 23636 }, { "epoch": 0.47276, "grad_norm": 1.9296875, "grad_norm_var": 0.014811197916666666, "learning_rate": 0.0001, "loss": 3.9646, "loss/crossentropy": 2.0555055141448975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19528509676456451, "step": 23638 }, { "epoch": 0.4728, "grad_norm": 1.8984375, "grad_norm_var": 0.013600413004557292, "learning_rate": 0.0001, "loss": 4.0768, "loss/crossentropy": 1.9836488366127014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17974190413951874, "step": 23640 }, { "epoch": 0.47284, "grad_norm": 2.21875, "grad_norm_var": 0.014644368489583334, "learning_rate": 0.0001, "loss": 4.4381, "loss/crossentropy": 2.455438733100891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21470309793949127, "step": 23642 }, { "epoch": 0.47288, "grad_norm": 1.90625, "grad_norm_var": 0.0148345947265625, "learning_rate": 0.0001, "loss": 3.9095, "loss/crossentropy": 2.0290130376815796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19266076385974884, "step": 23644 }, { "epoch": 0.47292, "grad_norm": 2.0625, "grad_norm_var": 0.014312489827473959, "learning_rate": 0.0001, "loss": 3.8732, "loss/crossentropy": 1.9230349659919739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17801092565059662, "step": 23646 }, { "epoch": 0.47296, "grad_norm": 2.1875, "grad_norm_var": 0.014452107747395833, "learning_rate": 0.0001, "loss": 4.1517, "loss/crossentropy": 2.015698790550232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20072294026613235, "step": 23648 }, { "epoch": 0.473, "grad_norm": 1.9140625, "grad_norm_var": 0.014357248942057291, "learning_rate": 0.0001, "loss": 3.8298, "loss/crossentropy": 1.9033147096633911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18670127540826797, "step": 23650 }, { "epoch": 0.47304, "grad_norm": 1.8359375, "grad_norm_var": 0.014906565348307291, "learning_rate": 0.0001, "loss": 3.7137, "loss/crossentropy": 1.7301848530769348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1701732650399208, "step": 23652 }, { "epoch": 0.47308, "grad_norm": 1.7734375, "grad_norm_var": 0.016747029622395833, "learning_rate": 0.0001, "loss": 3.7671, "loss/crossentropy": 2.0487022399902344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18679651618003845, "step": 23654 }, { "epoch": 0.47312, "grad_norm": 2.03125, "grad_norm_var": 0.017064412434895832, "learning_rate": 0.0001, "loss": 4.0564, "loss/crossentropy": 2.1262341737747192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21661578118801117, "step": 23656 }, { "epoch": 0.47316, "grad_norm": 1.9296875, "grad_norm_var": 0.011466217041015626, "learning_rate": 0.0001, "loss": 3.8656, "loss/crossentropy": 1.8145031929016113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787424087524414, "step": 23658 }, { "epoch": 0.4732, "grad_norm": 1.96875, "grad_norm_var": 0.0114013671875, "learning_rate": 0.0001, "loss": 3.9916, "loss/crossentropy": 2.1354995369911194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19884371012449265, "step": 23660 }, { "epoch": 0.47324, "grad_norm": 2.046875, "grad_norm_var": 0.010920206705729166, "learning_rate": 0.0001, "loss": 3.8533, "loss/crossentropy": 1.7008216381072998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1525547280907631, "step": 23662 }, { "epoch": 0.47328, "grad_norm": 2.0, "grad_norm_var": 0.006880442301432292, "learning_rate": 0.0001, "loss": 4.0684, "loss/crossentropy": 2.235612154006958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21687253564596176, "step": 23664 }, { "epoch": 0.47332, "grad_norm": 1.9921875, "grad_norm_var": 0.008819325764973959, "learning_rate": 0.0001, "loss": 4.3348, "loss/crossentropy": 2.375667631626129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21823541820049286, "step": 23666 }, { "epoch": 0.47336, "grad_norm": 1.9375, "grad_norm_var": 0.007995351155598959, "learning_rate": 0.0001, "loss": 4.2375, "loss/crossentropy": 2.223970353603363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21631266921758652, "step": 23668 }, { "epoch": 0.4734, "grad_norm": 1.7265625, "grad_norm_var": 0.009261067708333333, "learning_rate": 0.0001, "loss": 3.7413, "loss/crossentropy": 1.9337742328643799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18401625007390976, "step": 23670 }, { "epoch": 0.47344, "grad_norm": 1.921875, "grad_norm_var": 0.011400349934895833, "learning_rate": 0.0001, "loss": 4.1189, "loss/crossentropy": 2.1500974893569946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19867678731679916, "step": 23672 }, { "epoch": 0.47348, "grad_norm": 2.1875, "grad_norm_var": 0.015103912353515625, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 1.8808711171150208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18535293638706207, "step": 23674 }, { "epoch": 0.47352, "grad_norm": 2.0, "grad_norm_var": 0.0133544921875, "learning_rate": 0.0001, "loss": 4.1431, "loss/crossentropy": 2.1114797592163086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959507018327713, "step": 23676 }, { "epoch": 0.47356, "grad_norm": 2.09375, "grad_norm_var": 0.016141510009765624, "learning_rate": 0.0001, "loss": 3.8953, "loss/crossentropy": 2.2539754509925842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20711452513933182, "step": 23678 }, { "epoch": 0.4736, "grad_norm": 1.8671875, "grad_norm_var": 0.01567357381184896, "learning_rate": 0.0001, "loss": 3.9558, "loss/crossentropy": 1.950667679309845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18103761970996857, "step": 23680 }, { "epoch": 0.47364, "grad_norm": 1.8515625, "grad_norm_var": 0.015413157145182292, "learning_rate": 0.0001, "loss": 3.9297, "loss/crossentropy": 1.9214385747909546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19267746806144714, "step": 23682 }, { "epoch": 0.47368, "grad_norm": 1.7890625, "grad_norm_var": 0.0181304931640625, "learning_rate": 0.0001, "loss": 3.7439, "loss/crossentropy": 2.053990364074707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18003928661346436, "step": 23684 }, { "epoch": 0.47372, "grad_norm": 1.9609375, "grad_norm_var": 0.0151275634765625, "learning_rate": 0.0001, "loss": 3.951, "loss/crossentropy": 1.6660608649253845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15546631067991257, "step": 23686 }, { "epoch": 0.47376, "grad_norm": 1.8671875, "grad_norm_var": 0.013250478108723958, "learning_rate": 0.0001, "loss": 3.8348, "loss/crossentropy": 2.1377063989639282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202220119535923, "step": 23688 }, { "epoch": 0.4738, "grad_norm": 1.8203125, "grad_norm_var": 0.009293619791666667, "learning_rate": 0.0001, "loss": 3.8619, "loss/crossentropy": 1.9241121411323547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17836976051330566, "step": 23690 }, { "epoch": 0.47384, "grad_norm": 2.078125, "grad_norm_var": 0.009159088134765625, "learning_rate": 0.0001, "loss": 4.2613, "loss/crossentropy": 2.158058762550354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22955617308616638, "step": 23692 }, { "epoch": 0.47388, "grad_norm": 2.0, "grad_norm_var": 0.006870269775390625, "learning_rate": 0.0001, "loss": 4.24, "loss/crossentropy": 2.138846278190613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145211324095726, "step": 23694 }, { "epoch": 0.47392, "grad_norm": 1.9453125, "grad_norm_var": 0.006341298421223958, "learning_rate": 0.0001, "loss": 3.9193, "loss/crossentropy": 2.0770075917243958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009606510400772, "step": 23696 }, { "epoch": 0.47396, "grad_norm": 1.953125, "grad_norm_var": 0.006363932291666667, "learning_rate": 0.0001, "loss": 3.8684, "loss/crossentropy": 1.5739121437072754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16019296646118164, "step": 23698 }, { "epoch": 0.474, "grad_norm": 1.9453125, "grad_norm_var": 0.005078125, "learning_rate": 0.0001, "loss": 4.1262, "loss/crossentropy": 1.898674726486206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1942930817604065, "step": 23700 }, { "epoch": 0.47404, "grad_norm": 1.9140625, "grad_norm_var": 0.004776763916015625, "learning_rate": 0.0001, "loss": 4.0102, "loss/crossentropy": 2.2228487730026245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093074843287468, "step": 23702 }, { "epoch": 0.47408, "grad_norm": 2.09375, "grad_norm_var": 0.006390126546223959, "learning_rate": 0.0001, "loss": 4.1041, "loss/crossentropy": 2.1111066341400146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19743914157152176, "step": 23704 }, { "epoch": 0.47412, "grad_norm": 3.359375, "grad_norm_var": 0.1302886962890625, "learning_rate": 0.0001, "loss": 4.0682, "loss/crossentropy": 1.780187964439392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18026559054851532, "step": 23706 }, { "epoch": 0.47416, "grad_norm": 2.03125, "grad_norm_var": 0.13217137654622396, "learning_rate": 0.0001, "loss": 3.8936, "loss/crossentropy": 1.983467936515808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188174769282341, "step": 23708 }, { "epoch": 0.4742, "grad_norm": 1.8828125, "grad_norm_var": 0.13393961588541667, "learning_rate": 0.0001, "loss": 4.0822, "loss/crossentropy": 2.0928043723106384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048778161406517, "step": 23710 }, { "epoch": 0.47424, "grad_norm": 1.8984375, "grad_norm_var": 0.13646240234375, "learning_rate": 0.0001, "loss": 4.0171, "loss/crossentropy": 1.8958263397216797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17733225226402283, "step": 23712 }, { "epoch": 0.47428, "grad_norm": 1.9765625, "grad_norm_var": 0.1341461181640625, "learning_rate": 0.0001, "loss": 4.0589, "loss/crossentropy": 1.8104780316352844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1762789711356163, "step": 23714 }, { "epoch": 0.47432, "grad_norm": 1.9375, "grad_norm_var": 0.1332415262858073, "learning_rate": 0.0001, "loss": 4.0282, "loss/crossentropy": 2.167409896850586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20198846608400345, "step": 23716 }, { "epoch": 0.47436, "grad_norm": 1.8984375, "grad_norm_var": 0.13642756144205728, "learning_rate": 0.0001, "loss": 3.9456, "loss/crossentropy": 1.8979802131652832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17944183200597763, "step": 23718 }, { "epoch": 0.4744, "grad_norm": 1.8984375, "grad_norm_var": 0.13585205078125, "learning_rate": 0.0001, "loss": 4.1639, "loss/crossentropy": 2.3538358211517334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084932029247284, "step": 23720 }, { "epoch": 0.47444, "grad_norm": 1.953125, "grad_norm_var": 0.005028279622395834, "learning_rate": 0.0001, "loss": 4.1931, "loss/crossentropy": 2.144857406616211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20259954035282135, "step": 23722 }, { "epoch": 0.47448, "grad_norm": 2.0625, "grad_norm_var": 0.0050771077473958336, "learning_rate": 0.0001, "loss": 4.1082, "loss/crossentropy": 2.3802285194396973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2301197499036789, "step": 23724 }, { "epoch": 0.47452, "grad_norm": 1.890625, "grad_norm_var": 0.006025950113932292, "learning_rate": 0.0001, "loss": 4.212, "loss/crossentropy": 2.5175833702087402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22316204756498337, "step": 23726 }, { "epoch": 0.47456, "grad_norm": 2.078125, "grad_norm_var": 0.006514485677083333, "learning_rate": 0.0001, "loss": 4.3959, "loss/crossentropy": 2.266839861869812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20391426980495453, "step": 23728 }, { "epoch": 0.4746, "grad_norm": 1.921875, "grad_norm_var": 0.006526438395182291, "learning_rate": 0.0001, "loss": 3.9426, "loss/crossentropy": 1.9993603229522705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19872139394283295, "step": 23730 }, { "epoch": 0.47464, "grad_norm": 1.8515625, "grad_norm_var": 0.007321929931640625, "learning_rate": 0.0001, "loss": 4.0236, "loss/crossentropy": 1.999217450618744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18402128666639328, "step": 23732 }, { "epoch": 0.47468, "grad_norm": 2.015625, "grad_norm_var": 0.006894683837890625, "learning_rate": 0.0001, "loss": 4.0597, "loss/crossentropy": 2.2356200218200684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19472184777259827, "step": 23734 }, { "epoch": 0.47472, "grad_norm": 1.8671875, "grad_norm_var": 0.0072062174479166664, "learning_rate": 0.0001, "loss": 3.9894, "loss/crossentropy": 1.6930708289146423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16877874732017517, "step": 23736 }, { "epoch": 0.47476, "grad_norm": 1.921875, "grad_norm_var": 0.007985178629557292, "learning_rate": 0.0001, "loss": 3.6418, "loss/crossentropy": 2.0427737832069397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20055241882801056, "step": 23738 }, { "epoch": 0.4748, "grad_norm": 2.0625, "grad_norm_var": 0.007795206705729167, "learning_rate": 0.0001, "loss": 4.0536, "loss/crossentropy": 2.11184823513031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1939689666032791, "step": 23740 }, { "epoch": 0.47484, "grad_norm": 1.7734375, "grad_norm_var": 0.008778635660807292, "learning_rate": 0.0001, "loss": 3.8553, "loss/crossentropy": 1.9833735823631287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16304892301559448, "step": 23742 }, { "epoch": 0.47488, "grad_norm": 1.859375, "grad_norm_var": 0.006754302978515625, "learning_rate": 0.0001, "loss": 4.1299, "loss/crossentropy": 2.0734696984291077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18283121287822723, "step": 23744 }, { "epoch": 0.47492, "grad_norm": 2.015625, "grad_norm_var": 0.00877685546875, "learning_rate": 0.0001, "loss": 4.0763, "loss/crossentropy": 1.7331766486167908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18056727200746536, "step": 23746 }, { "epoch": 0.47496, "grad_norm": 2.046875, "grad_norm_var": 0.0089752197265625, "learning_rate": 0.0001, "loss": 4.314, "loss/crossentropy": 2.224283456802368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21115867048501968, "step": 23748 }, { "epoch": 0.475, "grad_norm": 1.9140625, "grad_norm_var": 0.007734934488932292, "learning_rate": 0.0001, "loss": 3.9618, "loss/crossentropy": 1.6858720183372498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16034550219774246, "step": 23750 }, { "epoch": 0.47504, "grad_norm": 1.96875, "grad_norm_var": 0.008304850260416666, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 2.038428485393524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020621821284294, "step": 23752 }, { "epoch": 0.47508, "grad_norm": 1.859375, "grad_norm_var": 0.007835896809895833, "learning_rate": 0.0001, "loss": 4.0219, "loss/crossentropy": 2.076684832572937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903630793094635, "step": 23754 }, { "epoch": 0.47512, "grad_norm": 1.8671875, "grad_norm_var": 0.007665761311848958, "learning_rate": 0.0001, "loss": 3.7804, "loss/crossentropy": 1.7917174100875854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1796514242887497, "step": 23756 }, { "epoch": 0.47516, "grad_norm": 2.046875, "grad_norm_var": 0.006859334309895834, "learning_rate": 0.0001, "loss": 4.0059, "loss/crossentropy": 1.9619473814964294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18760067224502563, "step": 23758 }, { "epoch": 0.4752, "grad_norm": 1.8828125, "grad_norm_var": 0.007990519205729166, "learning_rate": 0.0001, "loss": 3.787, "loss/crossentropy": 2.083077907562256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19316518306732178, "step": 23760 }, { "epoch": 0.47524, "grad_norm": 1.8828125, "grad_norm_var": 0.007100168863932292, "learning_rate": 0.0001, "loss": 3.8462, "loss/crossentropy": 1.8045600056648254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17447172850370407, "step": 23762 }, { "epoch": 0.47528, "grad_norm": 2.21875, "grad_norm_var": 0.013691965738932292, "learning_rate": 0.0001, "loss": 3.8125, "loss/crossentropy": 2.2159335613250732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20490044355392456, "step": 23764 }, { "epoch": 0.47532, "grad_norm": 1.796875, "grad_norm_var": 0.014851633707682292, "learning_rate": 0.0001, "loss": 3.7541, "loss/crossentropy": 2.0102903246879578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17014086246490479, "step": 23766 }, { "epoch": 0.47536, "grad_norm": 1.953125, "grad_norm_var": 0.012823232014973958, "learning_rate": 0.0001, "loss": 4.0154, "loss/crossentropy": 2.0524433851242065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20510738343000412, "step": 23768 }, { "epoch": 0.4754, "grad_norm": 1.921875, "grad_norm_var": 0.012849934895833333, "learning_rate": 0.0001, "loss": 3.8245, "loss/crossentropy": 1.9695302844047546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17639748752117157, "step": 23770 }, { "epoch": 0.47544, "grad_norm": 1.9921875, "grad_norm_var": 0.014642079671223959, "learning_rate": 0.0001, "loss": 4.4496, "loss/crossentropy": 2.3865933418273926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23802988976240158, "step": 23772 }, { "epoch": 0.47548, "grad_norm": 1.96875, "grad_norm_var": 0.013480631510416667, "learning_rate": 0.0001, "loss": 3.8708, "loss/crossentropy": 1.6841627955436707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1837886944413185, "step": 23774 }, { "epoch": 0.47552, "grad_norm": 1.875, "grad_norm_var": 0.0132232666015625, "learning_rate": 0.0001, "loss": 3.9562, "loss/crossentropy": 1.9436610341072083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144036665558815, "step": 23776 }, { "epoch": 0.47556, "grad_norm": 2.03125, "grad_norm_var": 0.013846842447916667, "learning_rate": 0.0001, "loss": 3.9312, "loss/crossentropy": 1.8565305471420288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19765028357505798, "step": 23778 }, { "epoch": 0.4756, "grad_norm": 1.859375, "grad_norm_var": 0.007165273030598958, "learning_rate": 0.0001, "loss": 4.0102, "loss/crossentropy": 2.119917571544647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1896904706954956, "step": 23780 }, { "epoch": 0.47564, "grad_norm": 1.90625, "grad_norm_var": 0.008892567952473958, "learning_rate": 0.0001, "loss": 4.0573, "loss/crossentropy": 1.9241788387298584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17011697590351105, "step": 23782 }, { "epoch": 0.47568, "grad_norm": 1.875, "grad_norm_var": 0.010495758056640625, "learning_rate": 0.0001, "loss": 3.7454, "loss/crossentropy": 1.7912859320640564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17273171246051788, "step": 23784 }, { "epoch": 0.47572, "grad_norm": 1.9921875, "grad_norm_var": 0.0112945556640625, "learning_rate": 0.0001, "loss": 4.4069, "loss/crossentropy": 2.4223393201828003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194758951663971, "step": 23786 }, { "epoch": 0.47576, "grad_norm": 1.8984375, "grad_norm_var": 0.010416412353515625, "learning_rate": 0.0001, "loss": 3.7825, "loss/crossentropy": 1.9073758721351624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19918088614940643, "step": 23788 }, { "epoch": 0.4758, "grad_norm": 1.8203125, "grad_norm_var": 0.0115631103515625, "learning_rate": 0.0001, "loss": 3.4428, "loss/crossentropy": 1.5801246762275696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16602589190006256, "step": 23790 }, { "epoch": 0.47584, "grad_norm": 1.890625, "grad_norm_var": 0.010892740885416667, "learning_rate": 0.0001, "loss": 3.7883, "loss/crossentropy": 1.847874402999878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17299149185419083, "step": 23792 }, { "epoch": 0.47588, "grad_norm": 1.8046875, "grad_norm_var": 0.010431925455729166, "learning_rate": 0.0001, "loss": 3.9211, "loss/crossentropy": 2.272279739379883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19657636433839798, "step": 23794 }, { "epoch": 0.47592, "grad_norm": 2.109375, "grad_norm_var": 0.014387003580729167, "learning_rate": 0.0001, "loss": 4.422, "loss/crossentropy": 1.9107636213302612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1978672295808792, "step": 23796 }, { "epoch": 0.47596, "grad_norm": 1.9609375, "grad_norm_var": 0.011374664306640626, "learning_rate": 0.0001, "loss": 4.2113, "loss/crossentropy": 2.384741425514221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.213532455265522, "step": 23798 }, { "epoch": 0.476, "grad_norm": 2.140625, "grad_norm_var": 0.019207509358723958, "learning_rate": 0.0001, "loss": 4.2722, "loss/crossentropy": 2.2380464673042297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984338015317917, "step": 23800 }, { "epoch": 0.47604, "grad_norm": 1.921875, "grad_norm_var": 0.019465891520182292, "learning_rate": 0.0001, "loss": 4.0274, "loss/crossentropy": 2.044179916381836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18904277682304382, "step": 23802 }, { "epoch": 0.47608, "grad_norm": 2.015625, "grad_norm_var": 0.019425455729166666, "learning_rate": 0.0001, "loss": 3.9933, "loss/crossentropy": 2.3498952388763428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2706408053636551, "step": 23804 }, { "epoch": 0.47612, "grad_norm": 1.96875, "grad_norm_var": 0.017710113525390626, "learning_rate": 0.0001, "loss": 3.7679, "loss/crossentropy": 2.2029510736465454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19871965795755386, "step": 23806 }, { "epoch": 0.47616, "grad_norm": 2.0, "grad_norm_var": 0.0169830322265625, "learning_rate": 0.0001, "loss": 3.9891, "loss/crossentropy": 1.7823997139930725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17446641623973846, "step": 23808 }, { "epoch": 0.4762, "grad_norm": 2.078125, "grad_norm_var": 0.016414388020833334, "learning_rate": 0.0001, "loss": 4.0756, "loss/crossentropy": 1.9810225367546082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1739773526787758, "step": 23810 }, { "epoch": 0.47624, "grad_norm": 1.8984375, "grad_norm_var": 0.015429433186848958, "learning_rate": 0.0001, "loss": 4.1135, "loss/crossentropy": 2.0867974758148193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1936039850115776, "step": 23812 }, { "epoch": 0.47628, "grad_norm": 1.8046875, "grad_norm_var": 0.017752838134765626, "learning_rate": 0.0001, "loss": 3.7694, "loss/crossentropy": 1.88890939950943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18023589998483658, "step": 23814 }, { "epoch": 0.47632, "grad_norm": 1.859375, "grad_norm_var": 0.007838694254557292, "learning_rate": 0.0001, "loss": 3.7677, "loss/crossentropy": 2.1205832958221436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20733095705509186, "step": 23816 }, { "epoch": 0.47636, "grad_norm": 2.109375, "grad_norm_var": 0.009804026285807291, "learning_rate": 0.0001, "loss": 3.9361, "loss/crossentropy": 2.1970152854919434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084852010011673, "step": 23818 }, { "epoch": 0.4764, "grad_norm": 2.09375, "grad_norm_var": 0.011484527587890625, "learning_rate": 0.0001, "loss": 3.9324, "loss/crossentropy": 1.9947285056114197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.181013822555542, "step": 23820 }, { "epoch": 0.47644, "grad_norm": 1.875, "grad_norm_var": 0.011205037434895834, "learning_rate": 0.0001, "loss": 3.7626, "loss/crossentropy": 1.8331729769706726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18118757009506226, "step": 23822 }, { "epoch": 0.47648, "grad_norm": 1.875, "grad_norm_var": 0.0111083984375, "learning_rate": 0.0001, "loss": 4.0555, "loss/crossentropy": 2.191226840019226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202358178794384, "step": 23824 }, { "epoch": 0.47652, "grad_norm": 1.8671875, "grad_norm_var": 0.009348297119140625, "learning_rate": 0.0001, "loss": 3.9524, "loss/crossentropy": 1.9327014088630676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17946144193410873, "step": 23826 }, { "epoch": 0.47656, "grad_norm": 1.9296875, "grad_norm_var": 0.007482655843098958, "learning_rate": 0.0001, "loss": 3.8711, "loss/crossentropy": 2.2243237495422363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19515040516853333, "step": 23828 }, { "epoch": 0.4766, "grad_norm": 1.8984375, "grad_norm_var": 0.00675048828125, "learning_rate": 0.0001, "loss": 4.1158, "loss/crossentropy": 2.176102042198181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19702810794115067, "step": 23830 }, { "epoch": 0.47664, "grad_norm": 1.875, "grad_norm_var": 0.006197102864583333, "learning_rate": 0.0001, "loss": 4.0918, "loss/crossentropy": 2.1210728883743286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.222815603017807, "step": 23832 }, { "epoch": 0.47668, "grad_norm": 1.984375, "grad_norm_var": 0.004134114583333333, "learning_rate": 0.0001, "loss": 3.9675, "loss/crossentropy": 1.9496826529502869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17903053760528564, "step": 23834 }, { "epoch": 0.47672, "grad_norm": 1.890625, "grad_norm_var": 0.0017575581868489584, "learning_rate": 0.0001, "loss": 3.9558, "loss/crossentropy": 2.139560639858246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20509754866361618, "step": 23836 }, { "epoch": 0.47676, "grad_norm": 1.875, "grad_norm_var": 0.0016591389973958333, "learning_rate": 0.0001, "loss": 4.0937, "loss/crossentropy": 2.102014422416687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19706308841705322, "step": 23838 }, { "epoch": 0.4768, "grad_norm": 1.8515625, "grad_norm_var": 0.0017486572265625, "learning_rate": 0.0001, "loss": 3.8519, "loss/crossentropy": 1.9479581117630005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20007412880659103, "step": 23840 }, { "epoch": 0.47684, "grad_norm": 1.921875, "grad_norm_var": 0.003177897135416667, "learning_rate": 0.0001, "loss": 4.2284, "loss/crossentropy": 1.8896448016166687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19973865151405334, "step": 23842 }, { "epoch": 0.47688, "grad_norm": 1.8515625, "grad_norm_var": 0.0030995686848958332, "learning_rate": 0.0001, "loss": 4.0693, "loss/crossentropy": 2.057901084423065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19682063907384872, "step": 23844 }, { "epoch": 0.47692, "grad_norm": 2.0, "grad_norm_var": 0.007355753580729167, "learning_rate": 0.0001, "loss": 4.2305, "loss/crossentropy": 1.9882251024246216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18287301808595657, "step": 23846 }, { "epoch": 0.47696, "grad_norm": 1.953125, "grad_norm_var": 0.008421834309895833, "learning_rate": 0.0001, "loss": 4.1016, "loss/crossentropy": 1.9481773972511292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17307578027248383, "step": 23848 }, { "epoch": 0.477, "grad_norm": 1.828125, "grad_norm_var": 0.01718928019205729, "learning_rate": 0.0001, "loss": 3.8738, "loss/crossentropy": 1.8696550130844116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18704131245613098, "step": 23850 }, { "epoch": 0.47704, "grad_norm": 1.8671875, "grad_norm_var": 0.0187652587890625, "learning_rate": 0.0001, "loss": 3.8426, "loss/crossentropy": 2.0851826667785645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18556270003318787, "step": 23852 }, { "epoch": 0.47708, "grad_norm": 2.09375, "grad_norm_var": 0.018943023681640626, "learning_rate": 0.0001, "loss": 4.2501, "loss/crossentropy": 2.3411136865615845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.225515678524971, "step": 23854 }, { "epoch": 0.47712, "grad_norm": 1.9140625, "grad_norm_var": 0.01817804972330729, "learning_rate": 0.0001, "loss": 3.7964, "loss/crossentropy": 1.944790780544281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.195308655500412, "step": 23856 }, { "epoch": 0.47716, "grad_norm": 2.328125, "grad_norm_var": 0.026594034830729165, "learning_rate": 0.0001, "loss": 4.004, "loss/crossentropy": 1.8672103881835938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18687407672405243, "step": 23858 }, { "epoch": 0.4772, "grad_norm": 1.9453125, "grad_norm_var": 0.026889801025390625, "learning_rate": 0.0001, "loss": 3.826, "loss/crossentropy": 1.9624406099319458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19766812026500702, "step": 23860 }, { "epoch": 0.47724, "grad_norm": 2.03125, "grad_norm_var": 0.026609039306640624, "learning_rate": 0.0001, "loss": 3.997, "loss/crossentropy": 2.1671148538589478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916585937142372, "step": 23862 }, { "epoch": 0.47728, "grad_norm": 1.921875, "grad_norm_var": 0.026668039957682292, "learning_rate": 0.0001, "loss": 4.0202, "loss/crossentropy": 1.8473829627037048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16666045784950256, "step": 23864 }, { "epoch": 0.47732, "grad_norm": 2.015625, "grad_norm_var": 0.018182118733723957, "learning_rate": 0.0001, "loss": 4.0035, "loss/crossentropy": 2.0888859033584595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19743109494447708, "step": 23866 }, { "epoch": 0.47736, "grad_norm": 1.8671875, "grad_norm_var": 0.01679051717122396, "learning_rate": 0.0001, "loss": 3.8915, "loss/crossentropy": 2.0148197412490845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103613168001175, "step": 23868 }, { "epoch": 0.4774, "grad_norm": 1.9375, "grad_norm_var": 0.015550740559895833, "learning_rate": 0.0001, "loss": 3.9269, "loss/crossentropy": 1.9780926704406738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18435538560152054, "step": 23870 }, { "epoch": 0.47744, "grad_norm": 2.109375, "grad_norm_var": 0.016068522135416666, "learning_rate": 0.0001, "loss": 3.9308, "loss/crossentropy": 2.026561677455902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19771291315555573, "step": 23872 }, { "epoch": 0.47748, "grad_norm": 1.90625, "grad_norm_var": 0.005606842041015625, "learning_rate": 0.0001, "loss": 3.9616, "loss/crossentropy": 2.029239237308502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19737554341554642, "step": 23874 }, { "epoch": 0.47752, "grad_norm": 1.8828125, "grad_norm_var": 0.005360666910807292, "learning_rate": 0.0001, "loss": 3.9622, "loss/crossentropy": 2.32661509513855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728177577257156, "step": 23876 }, { "epoch": 0.47756, "grad_norm": 1.8046875, "grad_norm_var": 0.005086008707682292, "learning_rate": 0.0001, "loss": 3.8965, "loss/crossentropy": 2.1897658109664917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915959268808365, "step": 23878 }, { "epoch": 0.4776, "grad_norm": 1.9765625, "grad_norm_var": 0.004889933268229166, "learning_rate": 0.0001, "loss": 4.1431, "loss/crossentropy": 2.3252129554748535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21492170542478561, "step": 23880 }, { "epoch": 0.47764, "grad_norm": 1.8125, "grad_norm_var": 0.005492146809895833, "learning_rate": 0.0001, "loss": 3.8796, "loss/crossentropy": 1.6142144203186035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1473793387413025, "step": 23882 }, { "epoch": 0.47768, "grad_norm": 2.0, "grad_norm_var": 0.005702463785807291, "learning_rate": 0.0001, "loss": 4.2673, "loss/crossentropy": 2.3027801513671875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21568017452955246, "step": 23884 }, { "epoch": 0.47772, "grad_norm": 2.046875, "grad_norm_var": 0.006791178385416667, "learning_rate": 0.0001, "loss": 3.9876, "loss/crossentropy": 1.7805312871932983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19376665353775024, "step": 23886 }, { "epoch": 0.47776, "grad_norm": 1.9140625, "grad_norm_var": 0.00482177734375, "learning_rate": 0.0001, "loss": 4.0035, "loss/crossentropy": 2.011800706386566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892775595188141, "step": 23888 }, { "epoch": 0.4778, "grad_norm": 1.8125, "grad_norm_var": 0.005649566650390625, "learning_rate": 0.0001, "loss": 4.0836, "loss/crossentropy": 2.257364869117737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2021905705332756, "step": 23890 }, { "epoch": 0.47784, "grad_norm": 1.9921875, "grad_norm_var": 0.005641428629557291, "learning_rate": 0.0001, "loss": 4.0778, "loss/crossentropy": 2.211492657661438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19960176944732666, "step": 23892 }, { "epoch": 0.47788, "grad_norm": 2.046875, "grad_norm_var": 0.006807454427083333, "learning_rate": 0.0001, "loss": 3.8243, "loss/crossentropy": 2.250791549682617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19187939167022705, "step": 23894 }, { "epoch": 0.47792, "grad_norm": 1.953125, "grad_norm_var": 0.0075927734375, "learning_rate": 0.0001, "loss": 3.9676, "loss/crossentropy": 1.8536748886108398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1737053319811821, "step": 23896 }, { "epoch": 0.47796, "grad_norm": 2.078125, "grad_norm_var": 0.014509073893229167, "learning_rate": 0.0001, "loss": 4.5734, "loss/crossentropy": 2.1692601442337036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19785165041685104, "step": 23898 }, { "epoch": 0.478, "grad_norm": 2.046875, "grad_norm_var": 0.014826456705729166, "learning_rate": 0.0001, "loss": 4.0647, "loss/crossentropy": 2.229991912841797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22324570268392563, "step": 23900 }, { "epoch": 0.47804, "grad_norm": 1.90625, "grad_norm_var": 0.014045206705729167, "learning_rate": 0.0001, "loss": 3.9411, "loss/crossentropy": 2.2393418550491333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19389375299215317, "step": 23902 }, { "epoch": 0.47808, "grad_norm": 2.015625, "grad_norm_var": 0.014009348551432292, "learning_rate": 0.0001, "loss": 4.1504, "loss/crossentropy": 2.072646141052246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844239741563797, "step": 23904 }, { "epoch": 0.47812, "grad_norm": 1.8046875, "grad_norm_var": 0.014202626546223958, "learning_rate": 0.0001, "loss": 4.1153, "loss/crossentropy": 2.3049235343933105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123921662569046, "step": 23906 }, { "epoch": 0.47816, "grad_norm": 1.84375, "grad_norm_var": 0.014964803059895834, "learning_rate": 0.0001, "loss": 3.7974, "loss/crossentropy": 1.9755294919013977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17440249770879745, "step": 23908 }, { "epoch": 0.4782, "grad_norm": 1.9609375, "grad_norm_var": 0.013402303059895834, "learning_rate": 0.0001, "loss": 3.8354, "loss/crossentropy": 1.8338207602500916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18320723623037338, "step": 23910 }, { "epoch": 0.47824, "grad_norm": 2.453125, "grad_norm_var": 0.026493326822916666, "learning_rate": 0.0001, "loss": 4.3086, "loss/crossentropy": 2.079473555088043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2535192593932152, "step": 23912 }, { "epoch": 0.47828, "grad_norm": 1.796875, "grad_norm_var": 0.023119099934895835, "learning_rate": 0.0001, "loss": 3.6006, "loss/crossentropy": 1.7429603934288025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17608244717121124, "step": 23914 }, { "epoch": 0.47832, "grad_norm": 1.875, "grad_norm_var": 0.023435211181640624, "learning_rate": 0.0001, "loss": 3.9687, "loss/crossentropy": 2.1863731145858765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18513252586126328, "step": 23916 }, { "epoch": 0.47836, "grad_norm": 1.8984375, "grad_norm_var": 0.0236083984375, "learning_rate": 0.0001, "loss": 3.948, "loss/crossentropy": 2.1431294679641724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22228983789682388, "step": 23918 }, { "epoch": 0.4784, "grad_norm": 1.921875, "grad_norm_var": 0.023827107747395833, "learning_rate": 0.0001, "loss": 3.9218, "loss/crossentropy": 1.8407437801361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16755413264036179, "step": 23920 }, { "epoch": 0.47844, "grad_norm": 2.140625, "grad_norm_var": 0.025229644775390626, "learning_rate": 0.0001, "loss": 4.2533, "loss/crossentropy": 2.0921709537506104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20164211839437485, "step": 23922 }, { "epoch": 0.47848, "grad_norm": 1.90625, "grad_norm_var": 0.04986979166666667, "learning_rate": 0.0001, "loss": 3.9155, "loss/crossentropy": 1.5495757460594177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1741114780306816, "step": 23924 }, { "epoch": 0.47852, "grad_norm": 2.015625, "grad_norm_var": 0.050510406494140625, "learning_rate": 0.0001, "loss": 3.9084, "loss/crossentropy": 2.0054045915603638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18482672423124313, "step": 23926 }, { "epoch": 0.47856, "grad_norm": 3.015625, "grad_norm_var": 0.10667724609375, "learning_rate": 0.0001, "loss": 4.3504, "loss/crossentropy": 1.9925623536109924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19737054407596588, "step": 23928 }, { "epoch": 0.4786, "grad_norm": 2.09375, "grad_norm_var": 0.0996002197265625, "learning_rate": 0.0001, "loss": 4.1226, "loss/crossentropy": 1.9019532203674316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19580943882465363, "step": 23930 }, { "epoch": 0.47864, "grad_norm": 1.8828125, "grad_norm_var": 0.10124104817708333, "learning_rate": 0.0001, "loss": 3.969, "loss/crossentropy": 1.9665276408195496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18561270087957382, "step": 23932 }, { "epoch": 0.47868, "grad_norm": 2.046875, "grad_norm_var": 0.10370992024739584, "learning_rate": 0.0001, "loss": 4.3574, "loss/crossentropy": 2.390560507774353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.194176584482193, "step": 23934 }, { "epoch": 0.47872, "grad_norm": 2.046875, "grad_norm_var": 0.10062026977539062, "learning_rate": 0.0001, "loss": 4.2517, "loss/crossentropy": 2.1132951974868774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915990635752678, "step": 23936 }, { "epoch": 0.47876, "grad_norm": 1.9921875, "grad_norm_var": 0.1031158447265625, "learning_rate": 0.0001, "loss": 4.0203, "loss/crossentropy": 1.8993237018585205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18519867211580276, "step": 23938 }, { "epoch": 0.4788, "grad_norm": 2.0, "grad_norm_var": 0.08478190104166666, "learning_rate": 0.0001, "loss": 4.546, "loss/crossentropy": 2.1111881732940674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20509092509746552, "step": 23940 }, { "epoch": 0.47884, "grad_norm": 1.84375, "grad_norm_var": 0.08461812337239584, "learning_rate": 0.0001, "loss": 4.1067, "loss/crossentropy": 2.3452861309051514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19262470304965973, "step": 23942 }, { "epoch": 0.47888, "grad_norm": 1.859375, "grad_norm_var": 0.014733632405598959, "learning_rate": 0.0001, "loss": 3.8591, "loss/crossentropy": 2.3471847772598267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20432017743587494, "step": 23944 }, { "epoch": 0.47892, "grad_norm": 1.875, "grad_norm_var": 0.01253662109375, "learning_rate": 0.0001, "loss": 3.9816, "loss/crossentropy": 1.9720887541770935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19780997186899185, "step": 23946 }, { "epoch": 0.47896, "grad_norm": 1.9765625, "grad_norm_var": 0.011124420166015624, "learning_rate": 0.0001, "loss": 4.2149, "loss/crossentropy": 2.311566114425659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22458520531654358, "step": 23948 }, { "epoch": 0.479, "grad_norm": 1.859375, "grad_norm_var": 0.011009724934895833, "learning_rate": 0.0001, "loss": 3.7254, "loss/crossentropy": 1.9993168711662292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1858225166797638, "step": 23950 }, { "epoch": 0.47904, "grad_norm": 1.9140625, "grad_norm_var": 0.010343170166015625, "learning_rate": 0.0001, "loss": 3.8156, "loss/crossentropy": 2.0514512062072754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19098126888275146, "step": 23952 }, { "epoch": 0.47908, "grad_norm": 2.140625, "grad_norm_var": 0.013248443603515625, "learning_rate": 0.0001, "loss": 3.6999, "loss/crossentropy": 1.6041250824928284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15559390932321548, "step": 23954 }, { "epoch": 0.47912, "grad_norm": 1.875, "grad_norm_var": 0.0064117431640625, "learning_rate": 0.0001, "loss": 3.946, "loss/crossentropy": 1.6981446146965027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16395021975040436, "step": 23956 }, { "epoch": 0.47916, "grad_norm": 1.875, "grad_norm_var": 0.007368977864583333, "learning_rate": 0.0001, "loss": 3.9571, "loss/crossentropy": 1.6894282102584839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18198157846927643, "step": 23958 }, { "epoch": 0.4792, "grad_norm": 2.875, "grad_norm_var": 0.06331278483072916, "learning_rate": 0.0001, "loss": 4.1842, "loss/crossentropy": 2.1354450583457947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053254321217537, "step": 23960 }, { "epoch": 0.47924, "grad_norm": 1.8125, "grad_norm_var": 0.06447652180989584, "learning_rate": 0.0001, "loss": 3.6665, "loss/crossentropy": 1.8701319098472595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1739087551832199, "step": 23962 }, { "epoch": 0.47928, "grad_norm": 1.984375, "grad_norm_var": 0.06447347005208333, "learning_rate": 0.0001, "loss": 4.0016, "loss/crossentropy": 2.1540639400482178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914234757423401, "step": 23964 }, { "epoch": 0.47932, "grad_norm": 2.0625, "grad_norm_var": 0.062170155843098956, "learning_rate": 0.0001, "loss": 3.9495, "loss/crossentropy": 1.6824876070022583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17504771798849106, "step": 23966 }, { "epoch": 0.47936, "grad_norm": 1.921875, "grad_norm_var": 0.06093114217122396, "learning_rate": 0.0001, "loss": 4.0193, "loss/crossentropy": 1.893419086933136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18423987925052643, "step": 23968 }, { "epoch": 0.4794, "grad_norm": 1.84375, "grad_norm_var": 0.0600738525390625, "learning_rate": 0.0001, "loss": 4.1371, "loss/crossentropy": 1.759018063545227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881185621023178, "step": 23970 }, { "epoch": 0.47944, "grad_norm": 2.21875, "grad_norm_var": 0.062263743082682295, "learning_rate": 0.0001, "loss": 4.0756, "loss/crossentropy": 2.1234898567199707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142435535788536, "step": 23972 }, { "epoch": 0.47948, "grad_norm": 1.8125, "grad_norm_var": 0.06409403483072916, "learning_rate": 0.0001, "loss": 3.9597, "loss/crossentropy": 2.260958671569824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19538094103336334, "step": 23974 }, { "epoch": 0.47952, "grad_norm": 1.8984375, "grad_norm_var": 0.0109039306640625, "learning_rate": 0.0001, "loss": 4.0813, "loss/crossentropy": 2.004380762577057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982579454779625, "step": 23976 }, { "epoch": 0.47956, "grad_norm": 1.7421875, "grad_norm_var": 0.012544759114583333, "learning_rate": 0.0001, "loss": 3.9318, "loss/crossentropy": 2.038970112800598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19289468228816986, "step": 23978 }, { "epoch": 0.4796, "grad_norm": 1.8125, "grad_norm_var": 0.013361612955729166, "learning_rate": 0.0001, "loss": 4.0707, "loss/crossentropy": 2.291250705718994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19521041214466095, "step": 23980 }, { "epoch": 0.47964, "grad_norm": 2.078125, "grad_norm_var": 0.013537343343098958, "learning_rate": 0.0001, "loss": 4.2081, "loss/crossentropy": 2.03319388628006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981820985674858, "step": 23982 }, { "epoch": 0.47968, "grad_norm": 1.953125, "grad_norm_var": 0.017899322509765624, "learning_rate": 0.0001, "loss": 4.259, "loss/crossentropy": 2.3277642726898193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22071011364459991, "step": 23984 }, { "epoch": 0.47972, "grad_norm": 1.8046875, "grad_norm_var": 0.017724609375, "learning_rate": 0.0001, "loss": 4.0444, "loss/crossentropy": 2.2068026065826416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19094634801149368, "step": 23986 }, { "epoch": 0.47976, "grad_norm": 1.953125, "grad_norm_var": 0.012475331624348959, "learning_rate": 0.0001, "loss": 4.0603, "loss/crossentropy": 2.3219540119171143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21838166564702988, "step": 23988 }, { "epoch": 0.4798, "grad_norm": 1.8125, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 3.9701, "loss/crossentropy": 1.9068449139595032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19176269322633743, "step": 23990 }, { "epoch": 0.47984, "grad_norm": 1.9921875, "grad_norm_var": 0.01620661417643229, "learning_rate": 0.0001, "loss": 3.8448, "loss/crossentropy": 1.8598286509513855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17854613065719604, "step": 23992 }, { "epoch": 0.47988, "grad_norm": 1.9140625, "grad_norm_var": 0.013670857747395833, "learning_rate": 0.0001, "loss": 4.3274, "loss/crossentropy": 2.3834491968154907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21013744175434113, "step": 23994 }, { "epoch": 0.47992, "grad_norm": 1.7890625, "grad_norm_var": 0.014009602864583333, "learning_rate": 0.0001, "loss": 3.8976, "loss/crossentropy": 1.9524520635604858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17633382230997086, "step": 23996 }, { "epoch": 0.47996, "grad_norm": 1.8046875, "grad_norm_var": 0.0135162353515625, "learning_rate": 0.0001, "loss": 3.8024, "loss/crossentropy": 1.8082820773124695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17721424251794815, "step": 23998 }, { "epoch": 0.48, "grad_norm": 1.8203125, "grad_norm_var": 0.0076983133951822914, "learning_rate": 0.0001, "loss": 4.0181, "loss/crossentropy": 2.2108744382858276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21027880907058716, "step": 24000 }, { "epoch": 0.48004, "grad_norm": 1.921875, "grad_norm_var": 0.007533518473307291, "learning_rate": 0.0001, "loss": 4.0885, "loss/crossentropy": 2.097791016101837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928793340921402, "step": 24002 }, { "epoch": 0.48008, "grad_norm": 1.875, "grad_norm_var": 0.008429972330729167, "learning_rate": 0.0001, "loss": 4.1388, "loss/crossentropy": 2.2565219402313232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21726776659488678, "step": 24004 }, { "epoch": 0.48012, "grad_norm": 1.84375, "grad_norm_var": 0.007706451416015625, "learning_rate": 0.0001, "loss": 4.0097, "loss/crossentropy": 2.260462522506714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973881646990776, "step": 24006 }, { "epoch": 0.48016, "grad_norm": 1.9765625, "grad_norm_var": 0.007713826497395834, "learning_rate": 0.0001, "loss": 4.4655, "loss/crossentropy": 2.2929039001464844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22804804146289825, "step": 24008 }, { "epoch": 0.4802, "grad_norm": 1.890625, "grad_norm_var": 0.008017730712890626, "learning_rate": 0.0001, "loss": 3.8324, "loss/crossentropy": 2.1556389331817627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19846376031637192, "step": 24010 }, { "epoch": 0.48024, "grad_norm": 2.078125, "grad_norm_var": 0.008514149983723959, "learning_rate": 0.0001, "loss": 4.2922, "loss/crossentropy": 2.2187989950180054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21181489527225494, "step": 24012 }, { "epoch": 0.48028, "grad_norm": 1.890625, "grad_norm_var": 0.007230631510416667, "learning_rate": 0.0001, "loss": 3.7714, "loss/crossentropy": 1.7474397420883179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18641416728496552, "step": 24014 }, { "epoch": 0.48032, "grad_norm": 1.9609375, "grad_norm_var": 0.006388346354166667, "learning_rate": 0.0001, "loss": 4.2139, "loss/crossentropy": 2.216716170310974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20577985048294067, "step": 24016 }, { "epoch": 0.48036, "grad_norm": 1.8046875, "grad_norm_var": 0.007673136393229167, "learning_rate": 0.0001, "loss": 3.9804, "loss/crossentropy": 2.004529654979706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19065643101930618, "step": 24018 }, { "epoch": 0.4804, "grad_norm": 2.0, "grad_norm_var": 0.007054646809895833, "learning_rate": 0.0001, "loss": 4.0908, "loss/crossentropy": 2.1064136028289795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20194996893405914, "step": 24020 }, { "epoch": 0.48044, "grad_norm": 1.8125, "grad_norm_var": 0.008208974202473959, "learning_rate": 0.0001, "loss": 4.0358, "loss/crossentropy": 1.9716034531593323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18451768904924393, "step": 24022 }, { "epoch": 0.48048, "grad_norm": 2.03125, "grad_norm_var": 0.0073394775390625, "learning_rate": 0.0001, "loss": 4.2538, "loss/crossentropy": 1.9206894636154175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21303895115852356, "step": 24024 }, { "epoch": 0.48052, "grad_norm": 1.859375, "grad_norm_var": 0.00718994140625, "learning_rate": 0.0001, "loss": 4.0227, "loss/crossentropy": 2.2184818983078003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.205161914229393, "step": 24026 }, { "epoch": 0.48056, "grad_norm": 2.21875, "grad_norm_var": 0.061962890625, "learning_rate": 0.0001, "loss": 4.0584, "loss/crossentropy": 1.6741513013839722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18154188245534897, "step": 24028 }, { "epoch": 0.4806, "grad_norm": 2.96875, "grad_norm_var": 0.11687418619791666, "learning_rate": 0.0001, "loss": 4.0782, "loss/crossentropy": 1.9249740839004517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18673353642225266, "step": 24030 }, { "epoch": 0.48064, "grad_norm": 2.0625, "grad_norm_var": 0.11699930826822917, "learning_rate": 0.0001, "loss": 3.9456, "loss/crossentropy": 1.7694995403289795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18005912005901337, "step": 24032 }, { "epoch": 0.48068, "grad_norm": 1.9375, "grad_norm_var": 0.11251220703125, "learning_rate": 0.0001, "loss": 4.0384, "loss/crossentropy": 1.797115981578827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15852996706962585, "step": 24034 }, { "epoch": 0.48072, "grad_norm": 1.8828125, "grad_norm_var": 0.11373062133789062, "learning_rate": 0.0001, "loss": 4.0262, "loss/crossentropy": 2.2001808881759644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20976591855287552, "step": 24036 }, { "epoch": 0.48076, "grad_norm": 1.984375, "grad_norm_var": 0.11187108357747395, "learning_rate": 0.0001, "loss": 4.1885, "loss/crossentropy": 2.24720299243927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19293447583913803, "step": 24038 }, { "epoch": 0.4808, "grad_norm": 1.84375, "grad_norm_var": 0.11860249837239584, "learning_rate": 0.0001, "loss": 3.8017, "loss/crossentropy": 2.0160406827926636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19429870694875717, "step": 24040 }, { "epoch": 0.48084, "grad_norm": 2.078125, "grad_norm_var": 0.11473770141601562, "learning_rate": 0.0001, "loss": 3.7577, "loss/crossentropy": 1.6300169229507446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16134048998355865, "step": 24042 }, { "epoch": 0.48088, "grad_norm": 1.8125, "grad_norm_var": 0.0718505859375, "learning_rate": 0.0001, "loss": 3.7797, "loss/crossentropy": 1.7445127367973328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16317147016525269, "step": 24044 }, { "epoch": 0.48092, "grad_norm": 2.15625, "grad_norm_var": 0.009417470296223958, "learning_rate": 0.0001, "loss": 3.6375, "loss/crossentropy": 1.8674694299697876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19713745266199112, "step": 24046 }, { "epoch": 0.48096, "grad_norm": 1.75, "grad_norm_var": 0.01416015625, "learning_rate": 0.0001, "loss": 3.709, "loss/crossentropy": 1.9868064522743225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17037389427423477, "step": 24048 }, { "epoch": 0.481, "grad_norm": 1.828125, "grad_norm_var": 0.02504450480143229, "learning_rate": 0.0001, "loss": 3.9139, "loss/crossentropy": 1.8376989960670471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17913921177387238, "step": 24050 }, { "epoch": 0.48104, "grad_norm": 1.8203125, "grad_norm_var": 0.02818781534830729, "learning_rate": 0.0001, "loss": 3.9877, "loss/crossentropy": 2.100782632827759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20613420754671097, "step": 24052 }, { "epoch": 0.48108, "grad_norm": 1.96875, "grad_norm_var": 0.033176422119140625, "learning_rate": 0.0001, "loss": 4.1303, "loss/crossentropy": 2.20209538936615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972164735198021, "step": 24054 }, { "epoch": 0.48112, "grad_norm": 1.75, "grad_norm_var": 0.0370361328125, "learning_rate": 0.0001, "loss": 4.1346, "loss/crossentropy": 2.3585156202316284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20321515947580338, "step": 24056 }, { "epoch": 0.48116, "grad_norm": 1.90625, "grad_norm_var": 0.03779678344726563, "learning_rate": 0.0001, "loss": 4.083, "loss/crossentropy": 1.8413453698158264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18250074982643127, "step": 24058 }, { "epoch": 0.4812, "grad_norm": 2.0, "grad_norm_var": 0.03765055338541667, "learning_rate": 0.0001, "loss": 3.9894, "loss/crossentropy": 1.975302815437317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21386513113975525, "step": 24060 }, { "epoch": 0.48124, "grad_norm": 1.90625, "grad_norm_var": 0.03631998697916667, "learning_rate": 0.0001, "loss": 4.0553, "loss/crossentropy": 2.029150128364563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18910515308380127, "step": 24062 }, { "epoch": 0.48128, "grad_norm": 1.8671875, "grad_norm_var": 0.031648508707682294, "learning_rate": 0.0001, "loss": 3.9889, "loss/crossentropy": 2.030007481575012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18395943939685822, "step": 24064 }, { "epoch": 0.48132, "grad_norm": 2.109375, "grad_norm_var": 0.026610310872395834, "learning_rate": 0.0001, "loss": 3.9652, "loss/crossentropy": 2.0372042059898376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20342208445072174, "step": 24066 }, { "epoch": 0.48136, "grad_norm": 1.8828125, "grad_norm_var": 0.0240386962890625, "learning_rate": 0.0001, "loss": 3.6604, "loss/crossentropy": 2.1543315649032593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886550337076187, "step": 24068 }, { "epoch": 0.4814, "grad_norm": 1.9921875, "grad_norm_var": 0.01649754842122396, "learning_rate": 0.0001, "loss": 3.8595, "loss/crossentropy": 2.125926434993744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22634702920913696, "step": 24070 }, { "epoch": 0.48144, "grad_norm": 1.9453125, "grad_norm_var": 0.009699503580729166, "learning_rate": 0.0001, "loss": 3.879, "loss/crossentropy": 2.057066559791565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19772379100322723, "step": 24072 }, { "epoch": 0.48148, "grad_norm": 1.9375, "grad_norm_var": 0.010151926676432292, "learning_rate": 0.0001, "loss": 3.8897, "loss/crossentropy": 1.6535254120826721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16015072911977768, "step": 24074 }, { "epoch": 0.48152, "grad_norm": 1.859375, "grad_norm_var": 0.009471638997395834, "learning_rate": 0.0001, "loss": 3.8221, "loss/crossentropy": 1.9959819912910461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19117611646652222, "step": 24076 }, { "epoch": 0.48156, "grad_norm": 1.9609375, "grad_norm_var": 0.008251698811848958, "learning_rate": 0.0001, "loss": 3.9646, "loss/crossentropy": 1.8243607878684998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17835189402103424, "step": 24078 }, { "epoch": 0.4816, "grad_norm": 2.09375, "grad_norm_var": 0.009779612223307291, "learning_rate": 0.0001, "loss": 3.8842, "loss/crossentropy": 2.0300013422966003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20206968486309052, "step": 24080 }, { "epoch": 0.48164, "grad_norm": 1.984375, "grad_norm_var": 0.0050771077473958336, "learning_rate": 0.0001, "loss": 3.8706, "loss/crossentropy": 1.9207960963249207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.195884570479393, "step": 24082 }, { "epoch": 0.48168, "grad_norm": 1.84375, "grad_norm_var": 0.004903157552083333, "learning_rate": 0.0001, "loss": 3.8254, "loss/crossentropy": 1.7780911922454834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16665860265493393, "step": 24084 }, { "epoch": 0.48172, "grad_norm": 1.953125, "grad_norm_var": 0.004811350504557292, "learning_rate": 0.0001, "loss": 4.0063, "loss/crossentropy": 2.028922200202942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19241196662187576, "step": 24086 }, { "epoch": 0.48176, "grad_norm": 1.8671875, "grad_norm_var": 0.004603830973307291, "learning_rate": 0.0001, "loss": 4.1312, "loss/crossentropy": 2.4450284242630005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077808380126953, "step": 24088 }, { "epoch": 0.4818, "grad_norm": 1.7578125, "grad_norm_var": 0.006769816080729167, "learning_rate": 0.0001, "loss": 4.1001, "loss/crossentropy": 2.0783454179763794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20609202980995178, "step": 24090 }, { "epoch": 0.48184, "grad_norm": 1.90625, "grad_norm_var": 0.006498209635416667, "learning_rate": 0.0001, "loss": 3.8235, "loss/crossentropy": 1.4834896326065063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1598825305700302, "step": 24092 }, { "epoch": 0.48188, "grad_norm": 2.25, "grad_norm_var": 0.013226064046223958, "learning_rate": 0.0001, "loss": 3.9492, "loss/crossentropy": 2.1462320685386658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19592846930027008, "step": 24094 }, { "epoch": 0.48192, "grad_norm": 1.96875, "grad_norm_var": 0.011735026041666667, "learning_rate": 0.0001, "loss": 4.1818, "loss/crossentropy": 2.197389841079712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19548462331295013, "step": 24096 }, { "epoch": 0.48196, "grad_norm": 1.953125, "grad_norm_var": 0.0119781494140625, "learning_rate": 0.0001, "loss": 4.0234, "loss/crossentropy": 2.1164830923080444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1923857405781746, "step": 24098 }, { "epoch": 0.482, "grad_norm": 1.9609375, "grad_norm_var": 0.011234283447265625, "learning_rate": 0.0001, "loss": 3.9165, "loss/crossentropy": 2.2651044130325317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23053671419620514, "step": 24100 }, { "epoch": 0.48204, "grad_norm": 2.140625, "grad_norm_var": 0.013155110677083333, "learning_rate": 0.0001, "loss": 4.0934, "loss/crossentropy": 2.0696656107902527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19399120658636093, "step": 24102 }, { "epoch": 0.48208, "grad_norm": 1.984375, "grad_norm_var": 0.012741851806640624, "learning_rate": 0.0001, "loss": 3.9432, "loss/crossentropy": 2.0894209146499634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19454246014356613, "step": 24104 }, { "epoch": 0.48212, "grad_norm": 1.828125, "grad_norm_var": 0.011844889322916666, "learning_rate": 0.0001, "loss": 3.7906, "loss/crossentropy": 1.9821223020553589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19387958943843842, "step": 24106 }, { "epoch": 0.48216, "grad_norm": 1.921875, "grad_norm_var": 0.010949452718098959, "learning_rate": 0.0001, "loss": 4.2146, "loss/crossentropy": 2.172493100166321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20457924157381058, "step": 24108 }, { "epoch": 0.4822, "grad_norm": 1.7578125, "grad_norm_var": 0.009740956624348958, "learning_rate": 0.0001, "loss": 3.8386, "loss/crossentropy": 2.0507744550704956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1748208999633789, "step": 24110 }, { "epoch": 0.48224, "grad_norm": 1.859375, "grad_norm_var": 0.01253662109375, "learning_rate": 0.0001, "loss": 3.6145, "loss/crossentropy": 1.9859300255775452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1867031380534172, "step": 24112 }, { "epoch": 0.48228, "grad_norm": 2.125, "grad_norm_var": 0.0148193359375, "learning_rate": 0.0001, "loss": 4.089, "loss/crossentropy": 2.1157150268554688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208559550344944, "step": 24114 }, { "epoch": 0.48232, "grad_norm": 1.90625, "grad_norm_var": 0.015869140625, "learning_rate": 0.0001, "loss": 4.0411, "loss/crossentropy": 2.0446948409080505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21388032287359238, "step": 24116 }, { "epoch": 0.48236, "grad_norm": 1.9765625, "grad_norm_var": 0.0132232666015625, "learning_rate": 0.0001, "loss": 4.0558, "loss/crossentropy": 2.1047087907791138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18818341195583344, "step": 24118 }, { "epoch": 0.4824, "grad_norm": 2.03125, "grad_norm_var": 0.013605753580729166, "learning_rate": 0.0001, "loss": 4.2172, "loss/crossentropy": 2.235207200050354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22523081302642822, "step": 24120 }, { "epoch": 0.48244, "grad_norm": 1.8125, "grad_norm_var": 0.012068430582682291, "learning_rate": 0.0001, "loss": 4.0648, "loss/crossentropy": 2.4851995706558228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19662421196699142, "step": 24122 }, { "epoch": 0.48248, "grad_norm": 1.9296875, "grad_norm_var": 0.012980143229166666, "learning_rate": 0.0001, "loss": 3.9836, "loss/crossentropy": 1.8986297249794006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18801461160182953, "step": 24124 }, { "epoch": 0.48252, "grad_norm": 1.703125, "grad_norm_var": 0.0140045166015625, "learning_rate": 0.0001, "loss": 3.8972, "loss/crossentropy": 1.7900904417037964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16809770464897156, "step": 24126 }, { "epoch": 0.48256, "grad_norm": 1.78125, "grad_norm_var": 0.012914021809895834, "learning_rate": 0.0001, "loss": 3.8146, "loss/crossentropy": 2.057170867919922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19056524336338043, "step": 24128 }, { "epoch": 0.4826, "grad_norm": 1.9375, "grad_norm_var": 0.009600575764973958, "learning_rate": 0.0001, "loss": 4.0454, "loss/crossentropy": 2.0708820819854736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18739734590053558, "step": 24130 }, { "epoch": 0.48264, "grad_norm": 2.03125, "grad_norm_var": 0.009989166259765625, "learning_rate": 0.0001, "loss": 4.1356, "loss/crossentropy": 1.9330047965049744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18113382160663605, "step": 24132 }, { "epoch": 0.48268, "grad_norm": 2.0, "grad_norm_var": 0.010178375244140624, "learning_rate": 0.0001, "loss": 3.9655, "loss/crossentropy": 2.076119899749756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1978432536125183, "step": 24134 }, { "epoch": 0.48272, "grad_norm": 2.015625, "grad_norm_var": 0.010994211832682291, "learning_rate": 0.0001, "loss": 4.0518, "loss/crossentropy": 1.9721894264221191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19195114821195602, "step": 24136 }, { "epoch": 0.48276, "grad_norm": 1.90625, "grad_norm_var": 0.010514322916666667, "learning_rate": 0.0001, "loss": 4.0326, "loss/crossentropy": 2.2981468439102173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2356787919998169, "step": 24138 }, { "epoch": 0.4828, "grad_norm": 2.203125, "grad_norm_var": 0.014104970296223958, "learning_rate": 0.0001, "loss": 4.0693, "loss/crossentropy": 2.1552239656448364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20404700934886932, "step": 24140 }, { "epoch": 0.48284, "grad_norm": 1.96875, "grad_norm_var": 0.010379791259765625, "learning_rate": 0.0001, "loss": 4.0906, "loss/crossentropy": 2.1359177231788635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20384428650140762, "step": 24142 }, { "epoch": 0.48288, "grad_norm": 2.015625, "grad_norm_var": 0.0084381103515625, "learning_rate": 0.0001, "loss": 4.2397, "loss/crossentropy": 2.340220808982849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20727048069238663, "step": 24144 }, { "epoch": 0.48292, "grad_norm": 1.875, "grad_norm_var": 0.009702301025390625, "learning_rate": 0.0001, "loss": 3.6822, "loss/crossentropy": 1.7174909114837646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16191110014915466, "step": 24146 }, { "epoch": 0.48296, "grad_norm": 1.7890625, "grad_norm_var": 0.010876210530598958, "learning_rate": 0.0001, "loss": 3.8731, "loss/crossentropy": 2.0727250576019287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.182908833026886, "step": 24148 }, { "epoch": 0.483, "grad_norm": 2.125, "grad_norm_var": 0.0141357421875, "learning_rate": 0.0001, "loss": 4.2179, "loss/crossentropy": 2.073746085166931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22062638401985168, "step": 24150 }, { "epoch": 0.48304, "grad_norm": 4.59375, "grad_norm_var": 0.4433143615722656, "learning_rate": 0.0001, "loss": 3.8633, "loss/crossentropy": 1.7323468327522278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16554639488458633, "step": 24152 }, { "epoch": 0.48308, "grad_norm": 1.96875, "grad_norm_var": 0.4394365946451823, "learning_rate": 0.0001, "loss": 4.0216, "loss/crossentropy": 2.136180579662323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192221999168396, "step": 24154 }, { "epoch": 0.48312, "grad_norm": 1.953125, "grad_norm_var": 0.4390459696451823, "learning_rate": 0.0001, "loss": 4.0375, "loss/crossentropy": 1.8351938128471375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17663314193487167, "step": 24156 }, { "epoch": 0.48316, "grad_norm": 1.96875, "grad_norm_var": 0.44324722290039065, "learning_rate": 0.0001, "loss": 3.9927, "loss/crossentropy": 2.02448707818985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19494900107383728, "step": 24158 }, { "epoch": 0.4832, "grad_norm": 1.9140625, "grad_norm_var": 0.4480865478515625, "learning_rate": 0.0001, "loss": 3.9197, "loss/crossentropy": 2.185999810695648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869122013449669, "step": 24160 }, { "epoch": 0.48324, "grad_norm": 2.0625, "grad_norm_var": 0.4416168212890625, "learning_rate": 0.0001, "loss": 4.2421, "loss/crossentropy": 2.16790235042572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19200150668621063, "step": 24162 }, { "epoch": 0.48328, "grad_norm": 1.828125, "grad_norm_var": 0.44467137654622396, "learning_rate": 0.0001, "loss": 3.7496, "loss/crossentropy": 1.5323420763015747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14329544454813004, "step": 24164 }, { "epoch": 0.48332, "grad_norm": 1.96875, "grad_norm_var": 0.4490631103515625, "learning_rate": 0.0001, "loss": 4.1271, "loss/crossentropy": 2.4106760025024414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088209092617035, "step": 24166 }, { "epoch": 0.48336, "grad_norm": 1.90625, "grad_norm_var": 0.0195220947265625, "learning_rate": 0.0001, "loss": 4.1251, "loss/crossentropy": 2.2519407272338867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21344104409217834, "step": 24168 }, { "epoch": 0.4834, "grad_norm": 1.859375, "grad_norm_var": 0.016771443684895835, "learning_rate": 0.0001, "loss": 4.129, "loss/crossentropy": 2.4721094369888306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20769134163856506, "step": 24170 }, { "epoch": 0.48344, "grad_norm": 2.03125, "grad_norm_var": 0.01695734659830729, "learning_rate": 0.0001, "loss": 4.0163, "loss/crossentropy": 1.794982135295868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18507220596075058, "step": 24172 }, { "epoch": 0.48348, "grad_norm": 2.0, "grad_norm_var": 0.01649958292643229, "learning_rate": 0.0001, "loss": 4.0596, "loss/crossentropy": 2.086494565010071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993468999862671, "step": 24174 }, { "epoch": 0.48352, "grad_norm": 1.953125, "grad_norm_var": 0.016112263997395834, "learning_rate": 0.0001, "loss": 4.2193, "loss/crossentropy": 1.9496687054634094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19727850705385208, "step": 24176 }, { "epoch": 0.48356, "grad_norm": 1.9765625, "grad_norm_var": 0.005777740478515625, "learning_rate": 0.0001, "loss": 4.1632, "loss/crossentropy": 2.0036423206329346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1781500205397606, "step": 24178 }, { "epoch": 0.4836, "grad_norm": 2.0, "grad_norm_var": 0.013263956705729166, "learning_rate": 0.0001, "loss": 3.9156, "loss/crossentropy": 2.1094807386398315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822209656238556, "step": 24180 }, { "epoch": 0.48364, "grad_norm": 2.015625, "grad_norm_var": 0.013722483317057292, "learning_rate": 0.0001, "loss": 3.9997, "loss/crossentropy": 2.0273566246032715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21027785539627075, "step": 24182 }, { "epoch": 0.48368, "grad_norm": 1.8203125, "grad_norm_var": 0.014705149332682292, "learning_rate": 0.0001, "loss": 3.9781, "loss/crossentropy": 1.9223063588142395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821991503238678, "step": 24184 }, { "epoch": 0.48372, "grad_norm": 1.828125, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 4.0162, "loss/crossentropy": 1.7469323873519897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19376590102910995, "step": 24186 }, { "epoch": 0.48376, "grad_norm": 1.765625, "grad_norm_var": 0.017545572916666665, "learning_rate": 0.0001, "loss": 3.7141, "loss/crossentropy": 1.7850856184959412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17863896489143372, "step": 24188 }, { "epoch": 0.4838, "grad_norm": 1.9609375, "grad_norm_var": 0.01910985310872396, "learning_rate": 0.0001, "loss": 4.009, "loss/crossentropy": 1.7316967844963074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16793282330036163, "step": 24190 }, { "epoch": 0.48384, "grad_norm": 2.09375, "grad_norm_var": 0.02093480428059896, "learning_rate": 0.0001, "loss": 4.0243, "loss/crossentropy": 2.3408809900283813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21008720248937607, "step": 24192 }, { "epoch": 0.48388, "grad_norm": 1.8515625, "grad_norm_var": 0.02029393513997396, "learning_rate": 0.0001, "loss": 3.7861, "loss/crossentropy": 2.0060762763023376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1912834793329239, "step": 24194 }, { "epoch": 0.48392, "grad_norm": 1.9765625, "grad_norm_var": 0.011189778645833334, "learning_rate": 0.0001, "loss": 4.1224, "loss/crossentropy": 2.1604164838790894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1941056326031685, "step": 24196 }, { "epoch": 0.48396, "grad_norm": 1.96875, "grad_norm_var": 0.010371907552083334, "learning_rate": 0.0001, "loss": 4.2158, "loss/crossentropy": 2.0909000635147095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20140909403562546, "step": 24198 }, { "epoch": 0.484, "grad_norm": 1.9375, "grad_norm_var": 0.009039052327473958, "learning_rate": 0.0001, "loss": 3.94, "loss/crossentropy": 1.8877050876617432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18381226062774658, "step": 24200 }, { "epoch": 0.48404, "grad_norm": 1.90625, "grad_norm_var": 0.008721669514973959, "learning_rate": 0.0001, "loss": 3.9384, "loss/crossentropy": 1.982464611530304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18099818378686905, "step": 24202 }, { "epoch": 0.48408, "grad_norm": 1.9453125, "grad_norm_var": 0.006481679280598959, "learning_rate": 0.0001, "loss": 4.1044, "loss/crossentropy": 1.9599023461341858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18184300512075424, "step": 24204 }, { "epoch": 0.48412, "grad_norm": 1.9765625, "grad_norm_var": 0.005110422770182292, "learning_rate": 0.0001, "loss": 4.1949, "loss/crossentropy": 2.4100862741470337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156199961900711, "step": 24206 }, { "epoch": 0.48416, "grad_norm": 1.921875, "grad_norm_var": 0.0038653055826822916, "learning_rate": 0.0001, "loss": 4.4761, "loss/crossentropy": 2.2121779322624207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23106029629707336, "step": 24208 }, { "epoch": 0.4842, "grad_norm": 1.6875, "grad_norm_var": 0.007331339518229166, "learning_rate": 0.0001, "loss": 3.8548, "loss/crossentropy": 1.9747964143753052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17455366998910904, "step": 24210 }, { "epoch": 0.48424, "grad_norm": 1.9921875, "grad_norm_var": 0.010008748372395833, "learning_rate": 0.0001, "loss": 4.3493, "loss/crossentropy": 2.3463014364242554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21605894714593887, "step": 24212 }, { "epoch": 0.48428, "grad_norm": 2.15625, "grad_norm_var": 0.012626139322916667, "learning_rate": 0.0001, "loss": 4.1674, "loss/crossentropy": 2.1442391872406006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003452554345131, "step": 24214 }, { "epoch": 0.48432, "grad_norm": 1.890625, "grad_norm_var": 0.015803019205729168, "learning_rate": 0.0001, "loss": 4.2701, "loss/crossentropy": 2.281853437423706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21563008427619934, "step": 24216 }, { "epoch": 0.48436, "grad_norm": 1.984375, "grad_norm_var": 0.01668065388997396, "learning_rate": 0.0001, "loss": 4.3137, "loss/crossentropy": 2.303357243537903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23583707213401794, "step": 24218 }, { "epoch": 0.4844, "grad_norm": 1.953125, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 4.1768, "loss/crossentropy": 1.6764637231826782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1820378378033638, "step": 24220 }, { "epoch": 0.48444, "grad_norm": 1.921875, "grad_norm_var": 0.01702855428059896, "learning_rate": 0.0001, "loss": 4.4549, "loss/crossentropy": 2.3033594489097595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21953578293323517, "step": 24222 }, { "epoch": 0.48448, "grad_norm": 2.0, "grad_norm_var": 0.017354329427083332, "learning_rate": 0.0001, "loss": 4.0358, "loss/crossentropy": 2.1051379442214966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19566316902637482, "step": 24224 }, { "epoch": 0.48452, "grad_norm": 2.0625, "grad_norm_var": 0.009997304280598958, "learning_rate": 0.0001, "loss": 4.0489, "loss/crossentropy": 2.122231125831604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20219532400369644, "step": 24226 }, { "epoch": 0.48456, "grad_norm": 1.7265625, "grad_norm_var": 0.013974761962890625, "learning_rate": 0.0001, "loss": 3.8343, "loss/crossentropy": 1.6727787256240845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15022944658994675, "step": 24228 }, { "epoch": 0.4846, "grad_norm": 1.90625, "grad_norm_var": 0.012410227457682292, "learning_rate": 0.0001, "loss": 4.1034, "loss/crossentropy": 2.0252939462661743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2014101818203926, "step": 24230 }, { "epoch": 0.48464, "grad_norm": 1.9375, "grad_norm_var": 0.011213175455729167, "learning_rate": 0.0001, "loss": 4.003, "loss/crossentropy": 2.163161873817444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20025929063558578, "step": 24232 }, { "epoch": 0.48468, "grad_norm": 1.75, "grad_norm_var": 0.010895792643229167, "learning_rate": 0.0001, "loss": 3.7811, "loss/crossentropy": 1.9188687205314636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916854828596115, "step": 24234 }, { "epoch": 0.48472, "grad_norm": 1.953125, "grad_norm_var": 0.009870402018229167, "learning_rate": 0.0001, "loss": 3.9804, "loss/crossentropy": 1.820820927619934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1769949272274971, "step": 24236 }, { "epoch": 0.48476, "grad_norm": 1.8984375, "grad_norm_var": 0.009297434488932292, "learning_rate": 0.0001, "loss": 3.9483, "loss/crossentropy": 2.1655235290527344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005934715270996, "step": 24238 }, { "epoch": 0.4848, "grad_norm": 1.9375, "grad_norm_var": 0.009810384114583333, "learning_rate": 0.0001, "loss": 4.0458, "loss/crossentropy": 1.9468095898628235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1830679327249527, "step": 24240 }, { "epoch": 0.48484, "grad_norm": 2.0, "grad_norm_var": 0.009535471598307291, "learning_rate": 0.0001, "loss": 4.0301, "loss/crossentropy": 2.062897562980652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2143079787492752, "step": 24242 }, { "epoch": 0.48488, "grad_norm": 2.171875, "grad_norm_var": 0.010389963785807291, "learning_rate": 0.0001, "loss": 4.1204, "loss/crossentropy": 2.0829350352287292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19584693014621735, "step": 24244 }, { "epoch": 0.48492, "grad_norm": 1.890625, "grad_norm_var": 0.0102447509765625, "learning_rate": 0.0001, "loss": 3.9724, "loss/crossentropy": 2.0563968420028687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20057959854602814, "step": 24246 }, { "epoch": 0.48496, "grad_norm": 2.171875, "grad_norm_var": 0.013396962483723959, "learning_rate": 0.0001, "loss": 4.2274, "loss/crossentropy": 2.1221665143966675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20288699865341187, "step": 24248 }, { "epoch": 0.485, "grad_norm": 1.921875, "grad_norm_var": 0.010374959309895833, "learning_rate": 0.0001, "loss": 4.2162, "loss/crossentropy": 2.1751914024353027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20034758746623993, "step": 24250 }, { "epoch": 0.48504, "grad_norm": 1.890625, "grad_norm_var": 0.010872395833333333, "learning_rate": 0.0001, "loss": 3.8777, "loss/crossentropy": 2.0292373299598694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18800581246614456, "step": 24252 }, { "epoch": 0.48508, "grad_norm": 4.03125, "grad_norm_var": 0.275439453125, "learning_rate": 0.0001, "loss": 4.0572, "loss/crossentropy": 2.3077808618545532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23775576055049896, "step": 24254 }, { "epoch": 0.48512, "grad_norm": 2.109375, "grad_norm_var": 0.27331441243489585, "learning_rate": 0.0001, "loss": 4.0708, "loss/crossentropy": 1.532023847103119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1606956273317337, "step": 24256 }, { "epoch": 0.48516, "grad_norm": 2.03125, "grad_norm_var": 0.26945699055989586, "learning_rate": 0.0001, "loss": 4.0351, "loss/crossentropy": 2.5032970905303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21631880104541779, "step": 24258 }, { "epoch": 0.4852, "grad_norm": 1.9296875, "grad_norm_var": 0.27319234212239585, "learning_rate": 0.0001, "loss": 3.9323, "loss/crossentropy": 1.981030285358429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19134312123060226, "step": 24260 }, { "epoch": 0.48524, "grad_norm": 1.953125, "grad_norm_var": 0.2728668212890625, "learning_rate": 0.0001, "loss": 3.7619, "loss/crossentropy": 2.031146466732025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17859568446874619, "step": 24262 }, { "epoch": 0.48528, "grad_norm": 1.8203125, "grad_norm_var": 0.2775917053222656, "learning_rate": 0.0001, "loss": 4.0923, "loss/crossentropy": 2.5168176889419556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21798798441886902, "step": 24264 }, { "epoch": 0.48532, "grad_norm": 4.09375, "grad_norm_var": 0.530810292561849, "learning_rate": 0.0001, "loss": 3.9951, "loss/crossentropy": 2.037827789783478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093580812215805, "step": 24266 }, { "epoch": 0.48536, "grad_norm": 1.9453125, "grad_norm_var": 0.5302874247233073, "learning_rate": 0.0001, "loss": 4.0575, "loss/crossentropy": 2.17299222946167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18525302410125732, "step": 24268 }, { "epoch": 0.4854, "grad_norm": 1.9296875, "grad_norm_var": 0.2958106994628906, "learning_rate": 0.0001, "loss": 3.9348, "loss/crossentropy": 2.147997260093689, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20097453892230988, "step": 24270 }, { "epoch": 0.48544, "grad_norm": 1.921875, "grad_norm_var": 0.298583984375, "learning_rate": 0.0001, "loss": 4.1208, "loss/crossentropy": 2.220878005027771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18916144967079163, "step": 24272 }, { "epoch": 0.48548, "grad_norm": 2.0625, "grad_norm_var": 0.297265625, "learning_rate": 0.0001, "loss": 4.0689, "loss/crossentropy": 2.3884263038635254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21515949070453644, "step": 24274 }, { "epoch": 0.48552, "grad_norm": 1.8984375, "grad_norm_var": 0.2965349833170573, "learning_rate": 0.0001, "loss": 4.091, "loss/crossentropy": 2.0658962726593018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19635243713855743, "step": 24276 }, { "epoch": 0.48556, "grad_norm": 1.8828125, "grad_norm_var": 0.297857411702474, "learning_rate": 0.0001, "loss": 4.2023, "loss/crossentropy": 2.3192285299301147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22074441611766815, "step": 24278 }, { "epoch": 0.4856, "grad_norm": 1.8671875, "grad_norm_var": 0.30143941243489586, "learning_rate": 0.0001, "loss": 3.8625, "loss/crossentropy": 2.082835614681244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18948380649089813, "step": 24280 }, { "epoch": 0.48564, "grad_norm": 2.046875, "grad_norm_var": 0.009039052327473958, "learning_rate": 0.0001, "loss": 3.8998, "loss/crossentropy": 1.7844191193580627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17573316395282745, "step": 24282 }, { "epoch": 0.48568, "grad_norm": 1.8828125, "grad_norm_var": 0.009161122639973958, "learning_rate": 0.0001, "loss": 3.8169, "loss/crossentropy": 1.9186521768569946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827767789363861, "step": 24284 }, { "epoch": 0.48572, "grad_norm": 2.296875, "grad_norm_var": 3.559098307291667, "learning_rate": 0.0001, "loss": 4.1294, "loss/crossentropy": 2.0445513129234314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22182191163301468, "step": 24286 }, { "epoch": 0.48576, "grad_norm": 2.140625, "grad_norm_var": 3.535992177327474, "learning_rate": 0.0001, "loss": 3.9131, "loss/crossentropy": 2.1303210258483887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20597820729017258, "step": 24288 }, { "epoch": 0.4858, "grad_norm": 1.96875, "grad_norm_var": 3.544976552327474, "learning_rate": 0.0001, "loss": 4.0554, "loss/crossentropy": 1.9035282135009766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17603804916143417, "step": 24290 }, { "epoch": 0.48584, "grad_norm": 2.109375, "grad_norm_var": 3.5285723368326822, "learning_rate": 0.0001, "loss": 4.0373, "loss/crossentropy": 2.101797640323639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19533278793096542, "step": 24292 }, { "epoch": 0.48588, "grad_norm": 1.875, "grad_norm_var": 3.5240071614583335, "learning_rate": 0.0001, "loss": 3.9883, "loss/crossentropy": 1.9595980048179626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17600257694721222, "step": 24294 }, { "epoch": 0.48592, "grad_norm": 1.84375, "grad_norm_var": 3.506517537434896, "learning_rate": 0.0001, "loss": 3.8312, "loss/crossentropy": 2.020717740058899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19629748165607452, "step": 24296 }, { "epoch": 0.48596, "grad_norm": 1.875, "grad_norm_var": 3.5107137044270833, "learning_rate": 0.0001, "loss": 4.4606, "loss/crossentropy": 2.113112688064575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987774595618248, "step": 24298 }, { "epoch": 0.486, "grad_norm": 1.984375, "grad_norm_var": 3.4833943684895834, "learning_rate": 0.0001, "loss": 4.3874, "loss/crossentropy": 2.3140907287597656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087773084640503, "step": 24300 }, { "epoch": 0.48604, "grad_norm": 2.0, "grad_norm_var": 0.012322743733723959, "learning_rate": 0.0001, "loss": 4.1962, "loss/crossentropy": 2.137213349342346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19433750957250595, "step": 24302 }, { "epoch": 0.48608, "grad_norm": 1.765625, "grad_norm_var": 0.015026601155598958, "learning_rate": 0.0001, "loss": 3.7098, "loss/crossentropy": 1.8572564125061035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18543125689029694, "step": 24304 }, { "epoch": 0.48612, "grad_norm": 1.7890625, "grad_norm_var": 0.017389933268229168, "learning_rate": 0.0001, "loss": 3.8251, "loss/crossentropy": 1.991189181804657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1831136792898178, "step": 24306 }, { "epoch": 0.48616, "grad_norm": 2.015625, "grad_norm_var": 0.016901652018229168, "learning_rate": 0.0001, "loss": 3.9742, "loss/crossentropy": 1.8540424704551697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1730886772274971, "step": 24308 }, { "epoch": 0.4862, "grad_norm": 1.875, "grad_norm_var": 0.016747792561848957, "learning_rate": 0.0001, "loss": 3.668, "loss/crossentropy": 2.0189526677131653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916787549853325, "step": 24310 }, { "epoch": 0.48624, "grad_norm": 1.9375, "grad_norm_var": 0.016071573893229166, "learning_rate": 0.0001, "loss": 4.0621, "loss/crossentropy": 2.2053693532943726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20035792887210846, "step": 24312 }, { "epoch": 0.48628, "grad_norm": 2.015625, "grad_norm_var": 0.012189737955729167, "learning_rate": 0.0001, "loss": 4.0676, "loss/crossentropy": 2.2726696729660034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21893906593322754, "step": 24314 }, { "epoch": 0.48632, "grad_norm": 1.890625, "grad_norm_var": 0.006940714518229167, "learning_rate": 0.0001, "loss": 3.6041, "loss/crossentropy": 1.9184311032295227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18804488331079483, "step": 24316 }, { "epoch": 0.48636, "grad_norm": 1.9921875, "grad_norm_var": 0.00714111328125, "learning_rate": 0.0001, "loss": 4.2603, "loss/crossentropy": 2.416733145713806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21675719320774078, "step": 24318 }, { "epoch": 0.4864, "grad_norm": 1.9765625, "grad_norm_var": 0.00570068359375, "learning_rate": 0.0001, "loss": 4.1108, "loss/crossentropy": 2.2064080238342285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20753097534179688, "step": 24320 }, { "epoch": 0.48644, "grad_norm": 2.015625, "grad_norm_var": 0.005324045817057292, "learning_rate": 0.0001, "loss": 4.1638, "loss/crossentropy": 1.9959591031074524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18936610966920853, "step": 24322 }, { "epoch": 0.48648, "grad_norm": 1.9140625, "grad_norm_var": 0.004613240559895833, "learning_rate": 0.0001, "loss": 4.0866, "loss/crossentropy": 2.14048969745636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17960359901189804, "step": 24324 }, { "epoch": 0.48652, "grad_norm": 2.109375, "grad_norm_var": 0.006845855712890625, "learning_rate": 0.0001, "loss": 4.1387, "loss/crossentropy": 1.6789081692695618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848226636648178, "step": 24326 }, { "epoch": 0.48656, "grad_norm": 2.0, "grad_norm_var": 0.0069620768229166664, "learning_rate": 0.0001, "loss": 4.2576, "loss/crossentropy": 2.4790940284729004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22105763852596283, "step": 24328 }, { "epoch": 0.4866, "grad_norm": 1.9296875, "grad_norm_var": 0.006981404622395834, "learning_rate": 0.0001, "loss": 4.1167, "loss/crossentropy": 1.8698575496673584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18414634466171265, "step": 24330 }, { "epoch": 0.48664, "grad_norm": 2.078125, "grad_norm_var": 0.0045969645182291664, "learning_rate": 0.0001, "loss": 4.1848, "loss/crossentropy": 2.1648661494255066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052561342716217, "step": 24332 }, { "epoch": 0.48668, "grad_norm": 1.8828125, "grad_norm_var": 0.0054107666015625, "learning_rate": 0.0001, "loss": 3.8961, "loss/crossentropy": 1.9554332494735718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2595558241009712, "step": 24334 }, { "epoch": 0.48672, "grad_norm": 1.8671875, "grad_norm_var": 0.005208079020182292, "learning_rate": 0.0001, "loss": 3.8727, "loss/crossentropy": 2.120553970336914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20485319197177887, "step": 24336 }, { "epoch": 0.48676, "grad_norm": 2.09375, "grad_norm_var": 0.006046295166015625, "learning_rate": 0.0001, "loss": 3.849, "loss/crossentropy": 1.7237398028373718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2036430537700653, "step": 24338 }, { "epoch": 0.4868, "grad_norm": 1.7734375, "grad_norm_var": 0.0088043212890625, "learning_rate": 0.0001, "loss": 3.9799, "loss/crossentropy": 1.7713853120803833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1712682917714119, "step": 24340 }, { "epoch": 0.48684, "grad_norm": 1.828125, "grad_norm_var": 0.00877685546875, "learning_rate": 0.0001, "loss": 3.6067, "loss/crossentropy": 2.0136027932167053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18176132440567017, "step": 24342 }, { "epoch": 0.48688, "grad_norm": 1.984375, "grad_norm_var": 0.007478841145833333, "learning_rate": 0.0001, "loss": 4.0666, "loss/crossentropy": 1.9793068766593933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973177120089531, "step": 24344 }, { "epoch": 0.48692, "grad_norm": 2.140625, "grad_norm_var": 0.011165364583333334, "learning_rate": 0.0001, "loss": 4.3935, "loss/crossentropy": 2.4376041889190674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22222332656383514, "step": 24346 }, { "epoch": 0.48696, "grad_norm": 1.921875, "grad_norm_var": 0.010448201497395834, "learning_rate": 0.0001, "loss": 3.9815, "loss/crossentropy": 1.8432115316390991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915820688009262, "step": 24348 }, { "epoch": 0.487, "grad_norm": 1.9453125, "grad_norm_var": 0.010530344645182292, "learning_rate": 0.0001, "loss": 3.8947, "loss/crossentropy": 1.8100075125694275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18175233900547028, "step": 24350 }, { "epoch": 0.48704, "grad_norm": 2.015625, "grad_norm_var": 0.0118072509765625, "learning_rate": 0.0001, "loss": 4.0207, "loss/crossentropy": 2.0997090339660645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19269437342882156, "step": 24352 }, { "epoch": 0.48708, "grad_norm": 2.125, "grad_norm_var": 0.013529205322265625, "learning_rate": 0.0001, "loss": 3.9769, "loss/crossentropy": 1.78102046251297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18288961052894592, "step": 24354 }, { "epoch": 0.48712, "grad_norm": 1.8359375, "grad_norm_var": 0.012227121988932292, "learning_rate": 0.0001, "loss": 3.9862, "loss/crossentropy": 2.006900370121002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19866903871297836, "step": 24356 }, { "epoch": 0.48716, "grad_norm": 1.8046875, "grad_norm_var": 0.013036855061848958, "learning_rate": 0.0001, "loss": 3.7675, "loss/crossentropy": 1.9674765467643738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17953628301620483, "step": 24358 }, { "epoch": 0.4872, "grad_norm": 1.796875, "grad_norm_var": 0.014772288004557292, "learning_rate": 0.0001, "loss": 3.8801, "loss/crossentropy": 1.6254491806030273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16119451820850372, "step": 24360 }, { "epoch": 0.48724, "grad_norm": 1.8828125, "grad_norm_var": 0.010749308268229167, "learning_rate": 0.0001, "loss": 4.0639, "loss/crossentropy": 2.1395972967147827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20453596115112305, "step": 24362 }, { "epoch": 0.48728, "grad_norm": 2.0, "grad_norm_var": 0.011022694905598958, "learning_rate": 0.0001, "loss": 4.1345, "loss/crossentropy": 2.0489049553871155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140253186225891, "step": 24364 }, { "epoch": 0.48732, "grad_norm": 2.046875, "grad_norm_var": 0.011502838134765625, "learning_rate": 0.0001, "loss": 4.2531, "loss/crossentropy": 2.061949372291565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023036777973175, "step": 24366 }, { "epoch": 0.48736, "grad_norm": 1.96875, "grad_norm_var": 0.008658854166666667, "learning_rate": 0.0001, "loss": 4.0565, "loss/crossentropy": 2.2293535470962524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20210447907447815, "step": 24368 }, { "epoch": 0.4874, "grad_norm": 2.046875, "grad_norm_var": 0.006525675455729167, "learning_rate": 0.0001, "loss": 4.0853, "loss/crossentropy": 2.085119366645813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19112756848335266, "step": 24370 }, { "epoch": 0.48744, "grad_norm": 2.234375, "grad_norm_var": 0.012178548177083333, "learning_rate": 0.0001, "loss": 4.0025, "loss/crossentropy": 1.8672300577163696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19416838884353638, "step": 24372 }, { "epoch": 0.48748, "grad_norm": 1.7734375, "grad_norm_var": 0.015378570556640625, "learning_rate": 0.0001, "loss": 4.0043, "loss/crossentropy": 1.7550761699676514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23619835823774338, "step": 24374 }, { "epoch": 0.48752, "grad_norm": 1.75, "grad_norm_var": 0.017061360677083335, "learning_rate": 0.0001, "loss": 3.8778, "loss/crossentropy": 2.199945092201233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198745958507061, "step": 24376 }, { "epoch": 0.48756, "grad_norm": 1.8828125, "grad_norm_var": 0.01672948201497396, "learning_rate": 0.0001, "loss": 3.9917, "loss/crossentropy": 2.005652666091919, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20299483090639114, "step": 24378 }, { "epoch": 0.4876, "grad_norm": 1.9921875, "grad_norm_var": 0.01756769816080729, "learning_rate": 0.0001, "loss": 3.8204, "loss/crossentropy": 1.9981390237808228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19434207677841187, "step": 24380 }, { "epoch": 0.48764, "grad_norm": 1.9296875, "grad_norm_var": 0.017203776041666667, "learning_rate": 0.0001, "loss": 4.0997, "loss/crossentropy": 1.9847465753555298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016771137714386, "step": 24382 }, { "epoch": 0.48768, "grad_norm": 1.9296875, "grad_norm_var": 0.017268880208333334, "learning_rate": 0.0001, "loss": 3.9819, "loss/crossentropy": 1.9587423205375671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20353511720895767, "step": 24384 }, { "epoch": 0.48772, "grad_norm": 1.796875, "grad_norm_var": 0.017919921875, "learning_rate": 0.0001, "loss": 3.639, "loss/crossentropy": 1.6634628176689148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18179689347743988, "step": 24386 }, { "epoch": 0.48776, "grad_norm": 1.9609375, "grad_norm_var": 0.011164347330729166, "learning_rate": 0.0001, "loss": 4.0095, "loss/crossentropy": 1.8678483963012695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18797649443149567, "step": 24388 }, { "epoch": 0.4878, "grad_norm": 2.59375, "grad_norm_var": 0.034024810791015624, "learning_rate": 0.0001, "loss": 4.245, "loss/crossentropy": 2.2724742889404297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19732308387756348, "step": 24390 }, { "epoch": 0.48784, "grad_norm": 2.09375, "grad_norm_var": 0.030918121337890625, "learning_rate": 0.0001, "loss": 4.1437, "loss/crossentropy": 2.2534377574920654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21449754387140274, "step": 24392 }, { "epoch": 0.48788, "grad_norm": 2.03125, "grad_norm_var": 0.03006159464518229, "learning_rate": 0.0001, "loss": 4.4295, "loss/crossentropy": 2.3007233142852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21122832596302032, "step": 24394 }, { "epoch": 0.48792, "grad_norm": 1.6796875, "grad_norm_var": 0.036195627848307294, "learning_rate": 0.0001, "loss": 3.7791, "loss/crossentropy": 2.0957794189453125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19196099787950516, "step": 24396 }, { "epoch": 0.48796, "grad_norm": 1.9296875, "grad_norm_var": 0.03947118123372396, "learning_rate": 0.0001, "loss": 4.2137, "loss/crossentropy": 1.8763219118118286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1841205433011055, "step": 24398 }, { "epoch": 0.488, "grad_norm": 1.9453125, "grad_norm_var": 0.03982747395833333, "learning_rate": 0.0001, "loss": 3.8093, "loss/crossentropy": 2.2737534046173096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200559563934803, "step": 24400 }, { "epoch": 0.48804, "grad_norm": 2.09375, "grad_norm_var": 0.03672866821289063, "learning_rate": 0.0001, "loss": 3.9472, "loss/crossentropy": 2.2588294744491577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21728847920894623, "step": 24402 }, { "epoch": 0.48808, "grad_norm": 2.078125, "grad_norm_var": 0.03683039347330729, "learning_rate": 0.0001, "loss": 4.052, "loss/crossentropy": 2.1669589281082153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2279760167002678, "step": 24404 }, { "epoch": 0.48812, "grad_norm": 1.9609375, "grad_norm_var": 0.014857737223307292, "learning_rate": 0.0001, "loss": 4.1939, "loss/crossentropy": 2.229967176914215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20311905443668365, "step": 24406 }, { "epoch": 0.48816, "grad_norm": 1.8046875, "grad_norm_var": 0.016242472330729167, "learning_rate": 0.0001, "loss": 3.8149, "loss/crossentropy": 1.7735026478767395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17489700764417648, "step": 24408 }, { "epoch": 0.4882, "grad_norm": 2.09375, "grad_norm_var": 0.016880035400390625, "learning_rate": 0.0001, "loss": 4.0679, "loss/crossentropy": 1.6055407524108887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16866382956504822, "step": 24410 }, { "epoch": 0.48824, "grad_norm": 2.265625, "grad_norm_var": 0.017826080322265625, "learning_rate": 0.0001, "loss": 4.1063, "loss/crossentropy": 2.0581588745117188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20155030488967896, "step": 24412 }, { "epoch": 0.48828, "grad_norm": 2.046875, "grad_norm_var": 0.022802734375, "learning_rate": 0.0001, "loss": 3.874, "loss/crossentropy": 2.0038405656814575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854727864265442, "step": 24414 }, { "epoch": 0.48832, "grad_norm": 2.0, "grad_norm_var": 0.0213134765625, "learning_rate": 0.0001, "loss": 4.4175, "loss/crossentropy": 2.5517786741256714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22493450343608856, "step": 24416 }, { "epoch": 0.48836, "grad_norm": 1.96875, "grad_norm_var": 0.021297200520833334, "learning_rate": 0.0001, "loss": 4.3037, "loss/crossentropy": 1.963084876537323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878989413380623, "step": 24418 }, { "epoch": 0.4884, "grad_norm": 2.28125, "grad_norm_var": 0.025023396809895834, "learning_rate": 0.0001, "loss": 4.3639, "loss/crossentropy": 2.093212604522705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20798558741807938, "step": 24420 }, { "epoch": 0.48844, "grad_norm": 1.859375, "grad_norm_var": 0.028537750244140625, "learning_rate": 0.0001, "loss": 3.9645, "loss/crossentropy": 2.2910980582237244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19769180566072464, "step": 24422 }, { "epoch": 0.48848, "grad_norm": 1.984375, "grad_norm_var": 0.02447509765625, "learning_rate": 0.0001, "loss": 4.0026, "loss/crossentropy": 1.7930738925933838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18997550755739212, "step": 24424 }, { "epoch": 0.48852, "grad_norm": 1.9765625, "grad_norm_var": 0.02550048828125, "learning_rate": 0.0001, "loss": 3.9086, "loss/crossentropy": 1.7596023678779602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17135074734687805, "step": 24426 }, { "epoch": 0.48856, "grad_norm": 1.9296875, "grad_norm_var": 0.019059244791666666, "learning_rate": 0.0001, "loss": 4.2215, "loss/crossentropy": 1.6692591905593872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1735754758119583, "step": 24428 }, { "epoch": 0.4886, "grad_norm": 2.28125, "grad_norm_var": 0.016015625, "learning_rate": 0.0001, "loss": 4.1244, "loss/crossentropy": 1.9515159130096436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20981048792600632, "step": 24430 }, { "epoch": 0.48864, "grad_norm": 1.7734375, "grad_norm_var": 0.019451649983723958, "learning_rate": 0.0001, "loss": 3.9243, "loss/crossentropy": 2.218432307243347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20877013355493546, "step": 24432 }, { "epoch": 0.48868, "grad_norm": 1.984375, "grad_norm_var": 0.01914240519205729, "learning_rate": 0.0001, "loss": 3.9418, "loss/crossentropy": 1.6877312660217285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17738080769777298, "step": 24434 }, { "epoch": 0.48872, "grad_norm": 1.90625, "grad_norm_var": 0.013427480061848959, "learning_rate": 0.0001, "loss": 4.1303, "loss/crossentropy": 2.204097032546997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016325369477272, "step": 24436 }, { "epoch": 0.48876, "grad_norm": 1.8671875, "grad_norm_var": 0.012330881754557292, "learning_rate": 0.0001, "loss": 3.8735, "loss/crossentropy": 2.287162959575653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044740840792656, "step": 24438 }, { "epoch": 0.4888, "grad_norm": 1.8984375, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 4.0459, "loss/crossentropy": 2.0852694511413574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053818255662918, "step": 24440 }, { "epoch": 0.48884, "grad_norm": 1.8046875, "grad_norm_var": 0.017170206705729166, "learning_rate": 0.0001, "loss": 4.075, "loss/crossentropy": 2.070200562477112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1872272714972496, "step": 24442 }, { "epoch": 0.48888, "grad_norm": 1.9765625, "grad_norm_var": 0.01678466796875, "learning_rate": 0.0001, "loss": 4.2843, "loss/crossentropy": 2.034148395061493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20018816739320755, "step": 24444 }, { "epoch": 0.48892, "grad_norm": 1.9140625, "grad_norm_var": 0.010282389322916667, "learning_rate": 0.0001, "loss": 3.9187, "loss/crossentropy": 2.2302210927009583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21468773484230042, "step": 24446 }, { "epoch": 0.48896, "grad_norm": 1.8515625, "grad_norm_var": 0.009584299723307292, "learning_rate": 0.0001, "loss": 3.8918, "loss/crossentropy": 1.9789615273475647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18996840715408325, "step": 24448 }, { "epoch": 0.489, "grad_norm": 2.046875, "grad_norm_var": 0.0099761962890625, "learning_rate": 0.0001, "loss": 4.2603, "loss/crossentropy": 2.1200879216194153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20140299201011658, "step": 24450 }, { "epoch": 0.48904, "grad_norm": 1.734375, "grad_norm_var": 0.013475545247395833, "learning_rate": 0.0001, "loss": 3.6989, "loss/crossentropy": 1.9175571203231812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18238398432731628, "step": 24452 }, { "epoch": 0.48908, "grad_norm": 2.140625, "grad_norm_var": 0.016527303059895835, "learning_rate": 0.0001, "loss": 4.1204, "loss/crossentropy": 2.414350748062134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2311321198940277, "step": 24454 }, { "epoch": 0.48912, "grad_norm": 1.9453125, "grad_norm_var": 0.016267903645833335, "learning_rate": 0.0001, "loss": 4.0023, "loss/crossentropy": 2.113545060157776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183447748422623, "step": 24456 }, { "epoch": 0.48916, "grad_norm": 2.53125, "grad_norm_var": 0.035123443603515624, "learning_rate": 0.0001, "loss": 4.0608, "loss/crossentropy": 1.9957387447357178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.183005228638649, "step": 24458 }, { "epoch": 0.4892, "grad_norm": 1.9765625, "grad_norm_var": 0.034970855712890624, "learning_rate": 0.0001, "loss": 4.0566, "loss/crossentropy": 2.333039402961731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21214177459478378, "step": 24460 }, { "epoch": 0.48924, "grad_norm": 1.8671875, "grad_norm_var": 0.035302734375, "learning_rate": 0.0001, "loss": 3.876, "loss/crossentropy": 2.0721434950828552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2012626901268959, "step": 24462 }, { "epoch": 0.48928, "grad_norm": 1.953125, "grad_norm_var": 0.0350006103515625, "learning_rate": 0.0001, "loss": 3.9045, "loss/crossentropy": 1.8990368843078613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19242222607135773, "step": 24464 }, { "epoch": 0.48932, "grad_norm": 1.9609375, "grad_norm_var": 0.0349029541015625, "learning_rate": 0.0001, "loss": 4.0894, "loss/crossentropy": 2.119647741317749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18154827505350113, "step": 24466 }, { "epoch": 0.48936, "grad_norm": 1.9921875, "grad_norm_var": 0.029679107666015624, "learning_rate": 0.0001, "loss": 3.9083, "loss/crossentropy": 2.0843252539634705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2219039350748062, "step": 24468 }, { "epoch": 0.4894, "grad_norm": 1.9140625, "grad_norm_var": 0.0286773681640625, "learning_rate": 0.0001, "loss": 3.979, "loss/crossentropy": 2.023647725582123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18391972035169601, "step": 24470 }, { "epoch": 0.48944, "grad_norm": 2.0, "grad_norm_var": 0.03695475260416667, "learning_rate": 0.0001, "loss": 4.2427, "loss/crossentropy": 2.1299458742141724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20914346724748611, "step": 24472 }, { "epoch": 0.48948, "grad_norm": 1.9296875, "grad_norm_var": 0.016690826416015624, "learning_rate": 0.0001, "loss": 3.734, "loss/crossentropy": 1.7355242371559143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1707276776432991, "step": 24474 }, { "epoch": 0.48952, "grad_norm": 2.1875, "grad_norm_var": 0.020888010660807293, "learning_rate": 0.0001, "loss": 3.9611, "loss/crossentropy": 1.7911944389343262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19432906806468964, "step": 24476 }, { "epoch": 0.48956, "grad_norm": 1.921875, "grad_norm_var": 0.0211090087890625, "learning_rate": 0.0001, "loss": 3.7178, "loss/crossentropy": 1.824588656425476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889827474951744, "step": 24478 }, { "epoch": 0.4896, "grad_norm": 1.875, "grad_norm_var": 0.020457967122395834, "learning_rate": 0.0001, "loss": 3.9154, "loss/crossentropy": 1.9896376132965088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033599317073822, "step": 24480 }, { "epoch": 0.48964, "grad_norm": 1.921875, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 4.0084, "loss/crossentropy": 2.1408406496047974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20912234485149384, "step": 24482 }, { "epoch": 0.48968, "grad_norm": 1.796875, "grad_norm_var": 0.02083307902018229, "learning_rate": 0.0001, "loss": 4.0378, "loss/crossentropy": 2.083206295967102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20860081911087036, "step": 24484 }, { "epoch": 0.48972, "grad_norm": 1.9140625, "grad_norm_var": 0.02238947550455729, "learning_rate": 0.0001, "loss": 3.676, "loss/crossentropy": 1.706899344921112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1741488128900528, "step": 24486 }, { "epoch": 0.48976, "grad_norm": 1.90625, "grad_norm_var": 0.011774698893229166, "learning_rate": 0.0001, "loss": 4.0392, "loss/crossentropy": 1.7981711030006409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1577363833785057, "step": 24488 }, { "epoch": 0.4898, "grad_norm": 2.015625, "grad_norm_var": 0.010716756184895834, "learning_rate": 0.0001, "loss": 4.0149, "loss/crossentropy": 2.3241711854934692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20463860034942627, "step": 24490 }, { "epoch": 0.48984, "grad_norm": 1.8515625, "grad_norm_var": 0.006810506184895833, "learning_rate": 0.0001, "loss": 3.8504, "loss/crossentropy": 1.8822330832481384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18153046071529388, "step": 24492 }, { "epoch": 0.48988, "grad_norm": 1.890625, "grad_norm_var": 0.006406402587890625, "learning_rate": 0.0001, "loss": 4.2485, "loss/crossentropy": 2.4287012815475464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21238531172275543, "step": 24494 }, { "epoch": 0.48992, "grad_norm": 1.828125, "grad_norm_var": 0.005783843994140625, "learning_rate": 0.0001, "loss": 3.898, "loss/crossentropy": 2.1026824712753296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19364942610263824, "step": 24496 }, { "epoch": 0.48996, "grad_norm": 1.875, "grad_norm_var": 0.00570068359375, "learning_rate": 0.0001, "loss": 3.8818, "loss/crossentropy": 2.045943260192871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19146864116191864, "step": 24498 }, { "epoch": 0.49, "grad_norm": 1.9140625, "grad_norm_var": 0.004587554931640625, "learning_rate": 0.0001, "loss": 3.9354, "loss/crossentropy": 2.0596802830696106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18570110201835632, "step": 24500 }, { "epoch": 0.49004, "grad_norm": 2.015625, "grad_norm_var": 0.005020904541015625, "learning_rate": 0.0001, "loss": 4.1919, "loss/crossentropy": 2.0923487544059753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19211189448833466, "step": 24502 }, { "epoch": 0.49008, "grad_norm": 1.7578125, "grad_norm_var": 0.007564036051432291, "learning_rate": 0.0001, "loss": 3.8132, "loss/crossentropy": 2.2522445917129517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19882844388484955, "step": 24504 }, { "epoch": 0.49012, "grad_norm": 1.9140625, "grad_norm_var": 0.0066487630208333336, "learning_rate": 0.0001, "loss": 4.0853, "loss/crossentropy": 1.9081300497055054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18056107312440872, "step": 24506 }, { "epoch": 0.49016, "grad_norm": 2.015625, "grad_norm_var": 0.0062978108723958336, "learning_rate": 0.0001, "loss": 4.0405, "loss/crossentropy": 2.100565791130066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18689429759979248, "step": 24508 }, { "epoch": 0.4902, "grad_norm": 2.015625, "grad_norm_var": 0.007562001546223958, "learning_rate": 0.0001, "loss": 3.9123, "loss/crossentropy": 1.9120057821273804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958208829164505, "step": 24510 }, { "epoch": 0.49024, "grad_norm": 1.859375, "grad_norm_var": 0.008888498942057291, "learning_rate": 0.0001, "loss": 3.9378, "loss/crossentropy": 1.812345802783966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17729748785495758, "step": 24512 }, { "epoch": 0.49028, "grad_norm": 2.015625, "grad_norm_var": 0.009797922770182292, "learning_rate": 0.0001, "loss": 3.9652, "loss/crossentropy": 1.9347732067108154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.207149900496006, "step": 24514 }, { "epoch": 0.49032, "grad_norm": 1.734375, "grad_norm_var": 0.011944325764973958, "learning_rate": 0.0001, "loss": 3.8159, "loss/crossentropy": 1.8915638327598572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16346803307533264, "step": 24516 }, { "epoch": 0.49036, "grad_norm": 1.7265625, "grad_norm_var": 0.011934407552083333, "learning_rate": 0.0001, "loss": 3.7906, "loss/crossentropy": 2.2307119369506836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19432727247476578, "step": 24518 }, { "epoch": 0.4904, "grad_norm": 2.1875, "grad_norm_var": 0.015240224202473958, "learning_rate": 0.0001, "loss": 4.012, "loss/crossentropy": 2.002243995666504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2070493996143341, "step": 24520 }, { "epoch": 0.49044, "grad_norm": 1.9140625, "grad_norm_var": 0.017210896809895834, "learning_rate": 0.0001, "loss": 3.8478, "loss/crossentropy": 2.166561722755432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18929924815893173, "step": 24522 }, { "epoch": 0.49048, "grad_norm": 2.03125, "grad_norm_var": 0.018619537353515625, "learning_rate": 0.0001, "loss": 4.1177, "loss/crossentropy": 2.1195461750030518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19457821547985077, "step": 24524 }, { "epoch": 0.49052, "grad_norm": 1.9921875, "grad_norm_var": 0.017878977457682292, "learning_rate": 0.0001, "loss": 4.195, "loss/crossentropy": 2.2752838134765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21283380687236786, "step": 24526 }, { "epoch": 0.49056, "grad_norm": 1.984375, "grad_norm_var": 0.015822092692057293, "learning_rate": 0.0001, "loss": 4.1894, "loss/crossentropy": 2.362849473953247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148214504122734, "step": 24528 }, { "epoch": 0.4906, "grad_norm": 1.8203125, "grad_norm_var": 0.01643651326497396, "learning_rate": 0.0001, "loss": 3.965, "loss/crossentropy": 1.9928752779960632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204628124833107, "step": 24530 }, { "epoch": 0.49064, "grad_norm": 1.90625, "grad_norm_var": 0.012206013997395833, "learning_rate": 0.0001, "loss": 4.1465, "loss/crossentropy": 2.2063074111938477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20204229652881622, "step": 24532 }, { "epoch": 0.49068, "grad_norm": 2.015625, "grad_norm_var": 0.008457183837890625, "learning_rate": 0.0001, "loss": 4.2816, "loss/crossentropy": 2.1019493341445923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18352027982473373, "step": 24534 }, { "epoch": 0.49072, "grad_norm": 1.8125, "grad_norm_var": 0.006949615478515625, "learning_rate": 0.0001, "loss": 4.0119, "loss/crossentropy": 1.9460110664367676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18813221156597137, "step": 24536 }, { "epoch": 0.49076, "grad_norm": 1.8125, "grad_norm_var": 0.006912994384765625, "learning_rate": 0.0001, "loss": 3.9377, "loss/crossentropy": 1.8915529251098633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198774054646492, "step": 24538 }, { "epoch": 0.4908, "grad_norm": 2.078125, "grad_norm_var": 0.0067860921223958336, "learning_rate": 0.0001, "loss": 4.2914, "loss/crossentropy": 2.0600665807724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067568153142929, "step": 24540 }, { "epoch": 0.49084, "grad_norm": 1.8046875, "grad_norm_var": 0.007347615559895834, "learning_rate": 0.0001, "loss": 3.8895, "loss/crossentropy": 2.3356127738952637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21175508201122284, "step": 24542 }, { "epoch": 0.49088, "grad_norm": 1.890625, "grad_norm_var": 0.006626129150390625, "learning_rate": 0.0001, "loss": 3.7529, "loss/crossentropy": 1.7713094353675842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16650734096765518, "step": 24544 }, { "epoch": 0.49092, "grad_norm": 2.03125, "grad_norm_var": 0.006514231363932292, "learning_rate": 0.0001, "loss": 4.1357, "loss/crossentropy": 2.307682991027832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20595747232437134, "step": 24546 }, { "epoch": 0.49096, "grad_norm": 1.8203125, "grad_norm_var": 0.0071044921875, "learning_rate": 0.0001, "loss": 3.7078, "loss/crossentropy": 2.0465540289878845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16928144544363022, "step": 24548 }, { "epoch": 0.491, "grad_norm": 1.859375, "grad_norm_var": 0.006422678629557292, "learning_rate": 0.0001, "loss": 3.7281, "loss/crossentropy": 1.6479852795600891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1531681939959526, "step": 24550 }, { "epoch": 0.49104, "grad_norm": 2.0625, "grad_norm_var": 0.007600911458333333, "learning_rate": 0.0001, "loss": 3.9933, "loss/crossentropy": 2.2218964099884033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1819760948419571, "step": 24552 }, { "epoch": 0.49108, "grad_norm": 1.75, "grad_norm_var": 0.009098307291666666, "learning_rate": 0.0001, "loss": 3.6829, "loss/crossentropy": 1.7314581274986267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1850220039486885, "step": 24554 }, { "epoch": 0.49112, "grad_norm": 1.953125, "grad_norm_var": 0.007059733072916667, "learning_rate": 0.0001, "loss": 3.9411, "loss/crossentropy": 2.1683263778686523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18814556300640106, "step": 24556 }, { "epoch": 0.49116, "grad_norm": 1.9921875, "grad_norm_var": 0.007098134358723958, "learning_rate": 0.0001, "loss": 4.2667, "loss/crossentropy": 2.364906072616577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20846372097730637, "step": 24558 }, { "epoch": 0.4912, "grad_norm": 1.8828125, "grad_norm_var": 0.007710520426432292, "learning_rate": 0.0001, "loss": 4.1047, "loss/crossentropy": 2.4396650791168213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21760626137256622, "step": 24560 }, { "epoch": 0.49124, "grad_norm": 2.375, "grad_norm_var": 0.059915924072265626, "learning_rate": 0.0001, "loss": 4.1424, "loss/crossentropy": 2.2420458793640137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19711492210626602, "step": 24562 }, { "epoch": 0.49128, "grad_norm": 1.84375, "grad_norm_var": 0.05835367838541667, "learning_rate": 0.0001, "loss": 3.8802, "loss/crossentropy": 1.8962340354919434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20642152428627014, "step": 24564 }, { "epoch": 0.49132, "grad_norm": 2.0, "grad_norm_var": 0.05555597941080729, "learning_rate": 0.0001, "loss": 3.9291, "loss/crossentropy": 2.024625241756439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20574413985013962, "step": 24566 }, { "epoch": 0.49136, "grad_norm": 2.03125, "grad_norm_var": 0.05478515625, "learning_rate": 0.0001, "loss": 4.2441, "loss/crossentropy": 2.2790380716323853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19655412435531616, "step": 24568 }, { "epoch": 0.4914, "grad_norm": 1.96875, "grad_norm_var": 0.04713923136393229, "learning_rate": 0.0001, "loss": 4.0206, "loss/crossentropy": 2.1089435815811157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18229448795318604, "step": 24570 }, { "epoch": 0.49144, "grad_norm": 2.1875, "grad_norm_var": 0.04691162109375, "learning_rate": 0.0001, "loss": 4.2213, "loss/crossentropy": 2.242396593093872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1903126761317253, "step": 24572 }, { "epoch": 0.49148, "grad_norm": 1.8359375, "grad_norm_var": 0.0496002197265625, "learning_rate": 0.0001, "loss": 3.9656, "loss/crossentropy": 2.3436553478240967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20461376011371613, "step": 24574 }, { "epoch": 0.49152, "grad_norm": 2.375, "grad_norm_var": 0.05408910115559896, "learning_rate": 0.0001, "loss": 3.9294, "loss/crossentropy": 2.020145893096924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20228110253810883, "step": 24576 }, { "epoch": 0.49156, "grad_norm": 2.03125, "grad_norm_var": 0.01663386027018229, "learning_rate": 0.0001, "loss": 4.1186, "loss/crossentropy": 2.0371296405792236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19508864730596542, "step": 24578 }, { "epoch": 0.4916, "grad_norm": 2.03125, "grad_norm_var": 0.014544423421223958, "learning_rate": 0.0001, "loss": 4.2877, "loss/crossentropy": 2.078322470188141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19366976618766785, "step": 24580 }, { "epoch": 0.49164, "grad_norm": 2.015625, "grad_norm_var": 0.014349110921223958, "learning_rate": 0.0001, "loss": 4.222, "loss/crossentropy": 2.3686896562576294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21069607138633728, "step": 24582 }, { "epoch": 0.49168, "grad_norm": 1.828125, "grad_norm_var": 0.01715876261393229, "learning_rate": 0.0001, "loss": 3.9882, "loss/crossentropy": 2.0332735180854797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21546462178230286, "step": 24584 }, { "epoch": 0.49172, "grad_norm": 1.984375, "grad_norm_var": 0.01779759724934896, "learning_rate": 0.0001, "loss": 4.1297, "loss/crossentropy": 2.0070220232009888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17601517587900162, "step": 24586 }, { "epoch": 0.49176, "grad_norm": 1.96875, "grad_norm_var": 0.014888254801432292, "learning_rate": 0.0001, "loss": 4.0979, "loss/crossentropy": 2.1559245586395264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19782397896051407, "step": 24588 }, { "epoch": 0.4918, "grad_norm": 1.859375, "grad_norm_var": 0.0160308837890625, "learning_rate": 0.0001, "loss": 3.8771, "loss/crossentropy": 2.173780679702759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20206089317798615, "step": 24590 }, { "epoch": 0.49184, "grad_norm": 1.9140625, "grad_norm_var": 0.008215077718098958, "learning_rate": 0.0001, "loss": 3.8911, "loss/crossentropy": 1.609294056892395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16182265430688858, "step": 24592 }, { "epoch": 0.49188, "grad_norm": 2.171875, "grad_norm_var": 0.08147557576497395, "learning_rate": 0.0001, "loss": 4.2665, "loss/crossentropy": 2.2137110233306885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27386993169784546, "step": 24594 }, { "epoch": 0.49192, "grad_norm": 2.015625, "grad_norm_var": 0.08635838826497395, "learning_rate": 0.0001, "loss": 3.7846, "loss/crossentropy": 1.9553529024124146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18842827528715134, "step": 24596 }, { "epoch": 0.49196, "grad_norm": 1.8203125, "grad_norm_var": 0.08915176391601562, "learning_rate": 0.0001, "loss": 4.0381, "loss/crossentropy": 1.8976882696151733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18534697592258453, "step": 24598 }, { "epoch": 0.492, "grad_norm": 2.25, "grad_norm_var": 0.09050064086914063, "learning_rate": 0.0001, "loss": 4.3902, "loss/crossentropy": 1.9944769740104675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20278682559728622, "step": 24600 }, { "epoch": 0.49204, "grad_norm": 1.765625, "grad_norm_var": 0.094580078125, "learning_rate": 0.0001, "loss": 3.8038, "loss/crossentropy": 2.3396860361099243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19831668585538864, "step": 24602 }, { "epoch": 0.49208, "grad_norm": 2.015625, "grad_norm_var": 0.09804280598958333, "learning_rate": 0.0001, "loss": 3.9544, "loss/crossentropy": 1.8764755725860596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19239525496959686, "step": 24604 }, { "epoch": 0.49212, "grad_norm": 2.015625, "grad_norm_var": 0.0951080322265625, "learning_rate": 0.0001, "loss": 4.0428, "loss/crossentropy": 1.9992756843566895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18613643199205399, "step": 24606 }, { "epoch": 0.49216, "grad_norm": 1.90625, "grad_norm_var": 0.09273681640625, "learning_rate": 0.0001, "loss": 4.1538, "loss/crossentropy": 2.0907077193260193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1898733228445053, "step": 24608 }, { "epoch": 0.4922, "grad_norm": 1.953125, "grad_norm_var": 0.017183430989583335, "learning_rate": 0.0001, "loss": 3.9356, "loss/crossentropy": 1.9387467503547668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077641263604164, "step": 24610 }, { "epoch": 0.49224, "grad_norm": 1.859375, "grad_norm_var": 0.015561676025390625, "learning_rate": 0.0001, "loss": 3.6729, "loss/crossentropy": 1.8329110741615295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17758768796920776, "step": 24612 }, { "epoch": 0.49228, "grad_norm": 1.9921875, "grad_norm_var": 0.01461181640625, "learning_rate": 0.0001, "loss": 3.9334, "loss/crossentropy": 1.8766810894012451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18651552498340607, "step": 24614 }, { "epoch": 0.49232, "grad_norm": 1.828125, "grad_norm_var": 0.009284464518229167, "learning_rate": 0.0001, "loss": 3.6679, "loss/crossentropy": 1.7456438541412354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16719914227724075, "step": 24616 }, { "epoch": 0.49236, "grad_norm": 1.890625, "grad_norm_var": 0.007920074462890624, "learning_rate": 0.0001, "loss": 4.0292, "loss/crossentropy": 2.0994815826416016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20065009593963623, "step": 24618 }, { "epoch": 0.4924, "grad_norm": 1.90625, "grad_norm_var": 0.007218170166015625, "learning_rate": 0.0001, "loss": 4.0588, "loss/crossentropy": 2.3052616119384766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2254004403948784, "step": 24620 }, { "epoch": 0.49244, "grad_norm": 1.9921875, "grad_norm_var": 0.006516265869140625, "learning_rate": 0.0001, "loss": 3.9487, "loss/crossentropy": 1.7560802102088928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17277851700782776, "step": 24622 }, { "epoch": 0.49248, "grad_norm": 1.9140625, "grad_norm_var": 0.007142893473307292, "learning_rate": 0.0001, "loss": 4.0868, "loss/crossentropy": 1.862656593322754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19756756722927094, "step": 24624 }, { "epoch": 0.49252, "grad_norm": 1.796875, "grad_norm_var": 0.006959788004557292, "learning_rate": 0.0001, "loss": 3.9022, "loss/crossentropy": 2.0374228954315186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19410685449838638, "step": 24626 }, { "epoch": 0.49256, "grad_norm": 1.90625, "grad_norm_var": 0.006315104166666667, "learning_rate": 0.0001, "loss": 3.9468, "loss/crossentropy": 2.2796911001205444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21644171327352524, "step": 24628 }, { "epoch": 0.4926, "grad_norm": 2.59375, "grad_norm_var": 0.03423436482747396, "learning_rate": 0.0001, "loss": 3.9822, "loss/crossentropy": 1.9718595743179321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18669331073760986, "step": 24630 }, { "epoch": 0.49264, "grad_norm": 1.9453125, "grad_norm_var": 0.030427042643229166, "learning_rate": 0.0001, "loss": 4.3174, "loss/crossentropy": 2.0071244835853577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994020640850067, "step": 24632 }, { "epoch": 0.49268, "grad_norm": 1.953125, "grad_norm_var": 0.029759724934895832, "learning_rate": 0.0001, "loss": 4.3168, "loss/crossentropy": 2.2078527212142944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190861478447914, "step": 24634 }, { "epoch": 0.49272, "grad_norm": 2.015625, "grad_norm_var": 0.0288482666015625, "learning_rate": 0.0001, "loss": 4.1319, "loss/crossentropy": 2.374260663986206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121455818414688, "step": 24636 }, { "epoch": 0.49276, "grad_norm": 2.046875, "grad_norm_var": 0.02938232421875, "learning_rate": 0.0001, "loss": 4.0958, "loss/crossentropy": 2.139094114303589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20233627408742905, "step": 24638 }, { "epoch": 0.4928, "grad_norm": 1.9921875, "grad_norm_var": 0.0288970947265625, "learning_rate": 0.0001, "loss": 4.1157, "loss/crossentropy": 1.9579105377197266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005467787384987, "step": 24640 }, { "epoch": 0.49284, "grad_norm": 2.03125, "grad_norm_var": 0.025199381510416667, "learning_rate": 0.0001, "loss": 4.3003, "loss/crossentropy": 2.122314691543579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005690187215805, "step": 24642 }, { "epoch": 0.49288, "grad_norm": 1.9921875, "grad_norm_var": 0.024094390869140624, "learning_rate": 0.0001, "loss": 4.3163, "loss/crossentropy": 1.9626818299293518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.196880042552948, "step": 24644 }, { "epoch": 0.49292, "grad_norm": 1.9609375, "grad_norm_var": 0.0026446024576822916, "learning_rate": 0.0001, "loss": 4.1957, "loss/crossentropy": 1.99526047706604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18154115974903107, "step": 24646 }, { "epoch": 0.49296, "grad_norm": 2.015625, "grad_norm_var": 0.0028310139973958333, "learning_rate": 0.0001, "loss": 4.1079, "loss/crossentropy": 2.2288442850112915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19869591295719147, "step": 24648 }, { "epoch": 0.493, "grad_norm": 1.9453125, "grad_norm_var": 0.002756500244140625, "learning_rate": 0.0001, "loss": 4.0832, "loss/crossentropy": 1.8555628061294556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18529972434043884, "step": 24650 }, { "epoch": 0.49304, "grad_norm": 1.9140625, "grad_norm_var": 0.0029042561848958335, "learning_rate": 0.0001, "loss": 4.0908, "loss/crossentropy": 2.1587546467781067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1936108022928238, "step": 24652 }, { "epoch": 0.49308, "grad_norm": 1.8984375, "grad_norm_var": 0.0025469462076822915, "learning_rate": 0.0001, "loss": 4.2219, "loss/crossentropy": 1.8942645192146301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18769075721502304, "step": 24654 }, { "epoch": 0.49312, "grad_norm": 1.9609375, "grad_norm_var": 0.0025266011555989585, "learning_rate": 0.0001, "loss": 4.0054, "loss/crossentropy": 2.334362268447876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21441897749900818, "step": 24656 }, { "epoch": 0.49316, "grad_norm": 2.25, "grad_norm_var": 0.011063639322916667, "learning_rate": 0.0001, "loss": 3.945, "loss/crossentropy": 1.9287384748458862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2295522838830948, "step": 24658 }, { "epoch": 0.4932, "grad_norm": 2.015625, "grad_norm_var": 0.012111155192057292, "learning_rate": 0.0001, "loss": 4.1852, "loss/crossentropy": 2.212242364883423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138296589255333, "step": 24660 }, { "epoch": 0.49324, "grad_norm": 2.125, "grad_norm_var": 0.014831288655598959, "learning_rate": 0.0001, "loss": 3.9479, "loss/crossentropy": 2.17527437210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1837603896856308, "step": 24662 }, { "epoch": 0.49328, "grad_norm": 1.875, "grad_norm_var": 0.01660741170247396, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 2.1768887042999268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20551389455795288, "step": 24664 }, { "epoch": 0.49332, "grad_norm": 1.984375, "grad_norm_var": 0.016646321614583334, "learning_rate": 0.0001, "loss": 3.9299, "loss/crossentropy": 1.987557053565979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999402493238449, "step": 24666 }, { "epoch": 0.49336, "grad_norm": 1.8046875, "grad_norm_var": 0.0241851806640625, "learning_rate": 0.0001, "loss": 3.5654, "loss/crossentropy": 1.7189387083053589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16583546251058578, "step": 24668 }, { "epoch": 0.4934, "grad_norm": 1.8515625, "grad_norm_var": 0.026021321614583332, "learning_rate": 0.0001, "loss": 3.9805, "loss/crossentropy": 2.0449089407920837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21923594176769257, "step": 24670 }, { "epoch": 0.49344, "grad_norm": 4.21875, "grad_norm_var": 0.34633763631184894, "learning_rate": 0.0001, "loss": 4.0541, "loss/crossentropy": 1.922966718673706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19492915272712708, "step": 24672 }, { "epoch": 0.49348, "grad_norm": 1.84375, "grad_norm_var": 0.34245198567708335, "learning_rate": 0.0001, "loss": 4.1132, "loss/crossentropy": 1.8697016835212708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21420104801654816, "step": 24674 }, { "epoch": 0.49352, "grad_norm": 2.0625, "grad_norm_var": 0.34196370442708335, "learning_rate": 0.0001, "loss": 4.2348, "loss/crossentropy": 2.2170732021331787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20673636347055435, "step": 24676 }, { "epoch": 0.49356, "grad_norm": 1.8984375, "grad_norm_var": 0.34133707682291664, "learning_rate": 0.0001, "loss": 3.7056, "loss/crossentropy": 2.0645321011543274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113899141550064, "step": 24678 }, { "epoch": 0.4936, "grad_norm": 1.859375, "grad_norm_var": 0.3439776102701823, "learning_rate": 0.0001, "loss": 4.0543, "loss/crossentropy": 2.0374088883399963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20146184414625168, "step": 24680 }, { "epoch": 0.49364, "grad_norm": 1.9140625, "grad_norm_var": 0.34632975260416665, "learning_rate": 0.0001, "loss": 4.0364, "loss/crossentropy": 1.9996783137321472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19109730422496796, "step": 24682 }, { "epoch": 0.49368, "grad_norm": 1.90625, "grad_norm_var": 0.329974110921224, "learning_rate": 0.0001, "loss": 4.1988, "loss/crossentropy": 2.2255775928497314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2078489512205124, "step": 24684 }, { "epoch": 0.49372, "grad_norm": 1.859375, "grad_norm_var": 0.3313385009765625, "learning_rate": 0.0001, "loss": 4.2153, "loss/crossentropy": 1.9860047698020935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21092459559440613, "step": 24686 }, { "epoch": 0.49376, "grad_norm": 1.984375, "grad_norm_var": 0.020475260416666665, "learning_rate": 0.0001, "loss": 4.0701, "loss/crossentropy": 1.9905331134796143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982455775141716, "step": 24688 }, { "epoch": 0.4938, "grad_norm": 1.9296875, "grad_norm_var": 0.012786610921223959, "learning_rate": 0.0001, "loss": 4.1729, "loss/crossentropy": 2.254474639892578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20714619755744934, "step": 24690 }, { "epoch": 0.49384, "grad_norm": 1.8984375, "grad_norm_var": 0.011376698811848959, "learning_rate": 0.0001, "loss": 3.8576, "loss/crossentropy": 2.264926791191101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1946386992931366, "step": 24692 }, { "epoch": 0.49388, "grad_norm": 1.8984375, "grad_norm_var": 0.01116943359375, "learning_rate": 0.0001, "loss": 3.847, "loss/crossentropy": 2.071332335472107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170526310801506, "step": 24694 }, { "epoch": 0.49392, "grad_norm": 1.8984375, "grad_norm_var": 0.010746002197265625, "learning_rate": 0.0001, "loss": 3.8254, "loss/crossentropy": 1.973964512348175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20519066601991653, "step": 24696 }, { "epoch": 0.49396, "grad_norm": 1.9296875, "grad_norm_var": 0.011644490559895833, "learning_rate": 0.0001, "loss": 3.848, "loss/crossentropy": 1.6099820137023926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15943622589111328, "step": 24698 }, { "epoch": 0.494, "grad_norm": 1.8828125, "grad_norm_var": 0.010542551676432291, "learning_rate": 0.0001, "loss": 4.1036, "loss/crossentropy": 2.1428415775299072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20248957723379135, "step": 24700 }, { "epoch": 0.49404, "grad_norm": 1.9765625, "grad_norm_var": 0.003742472330729167, "learning_rate": 0.0001, "loss": 4.3402, "loss/crossentropy": 2.3973593711853027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21721762418746948, "step": 24702 }, { "epoch": 0.49408, "grad_norm": 2.046875, "grad_norm_var": 0.004219563802083334, "learning_rate": 0.0001, "loss": 4.2233, "loss/crossentropy": 2.184257686138153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20263968408107758, "step": 24704 }, { "epoch": 0.49412, "grad_norm": 1.984375, "grad_norm_var": 0.0042111714680989586, "learning_rate": 0.0001, "loss": 3.9782, "loss/crossentropy": 1.953734815120697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18011564016342163, "step": 24706 }, { "epoch": 0.49416, "grad_norm": 1.9296875, "grad_norm_var": 0.00406494140625, "learning_rate": 0.0001, "loss": 4.0393, "loss/crossentropy": 2.0036890506744385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21608620882034302, "step": 24708 }, { "epoch": 0.4942, "grad_norm": 1.8359375, "grad_norm_var": 0.0047686258951822914, "learning_rate": 0.0001, "loss": 3.9851, "loss/crossentropy": 2.016548752784729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18857601284980774, "step": 24710 }, { "epoch": 0.49424, "grad_norm": 1.875, "grad_norm_var": 0.0053179423014322914, "learning_rate": 0.0001, "loss": 3.9084, "loss/crossentropy": 2.0665934085845947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18890216201543808, "step": 24712 }, { "epoch": 0.49428, "grad_norm": 1.8359375, "grad_norm_var": 0.005216471354166667, "learning_rate": 0.0001, "loss": 4.2576, "loss/crossentropy": 2.102554202079773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19079755991697311, "step": 24714 }, { "epoch": 0.49432, "grad_norm": 1.8125, "grad_norm_var": 0.007893625895182292, "learning_rate": 0.0001, "loss": 4.2018, "loss/crossentropy": 2.326322555541992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21783028542995453, "step": 24716 }, { "epoch": 0.49436, "grad_norm": 1.8359375, "grad_norm_var": 0.0073811848958333336, "learning_rate": 0.0001, "loss": 4.2362, "loss/crossentropy": 2.2604973316192627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19831713289022446, "step": 24718 }, { "epoch": 0.4944, "grad_norm": 1.9765625, "grad_norm_var": 0.00660400390625, "learning_rate": 0.0001, "loss": 3.859, "loss/crossentropy": 2.2417763471603394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19713596254587173, "step": 24720 }, { "epoch": 0.49444, "grad_norm": 1.9921875, "grad_norm_var": 0.009533437093098958, "learning_rate": 0.0001, "loss": 4.5059, "loss/crossentropy": 2.21281498670578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067965790629387, "step": 24722 }, { "epoch": 0.49448, "grad_norm": 1.921875, "grad_norm_var": 0.009696451822916667, "learning_rate": 0.0001, "loss": 4.0386, "loss/crossentropy": 2.288776397705078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19350187480449677, "step": 24724 }, { "epoch": 0.49452, "grad_norm": 1.953125, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 4.0452, "loss/crossentropy": 1.8925097584724426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952025145292282, "step": 24726 }, { "epoch": 0.49456, "grad_norm": 1.96875, "grad_norm_var": 0.009287261962890625, "learning_rate": 0.0001, "loss": 3.8005, "loss/crossentropy": 2.2274699211120605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19862639904022217, "step": 24728 }, { "epoch": 0.4946, "grad_norm": 1.9296875, "grad_norm_var": 0.008671061197916666, "learning_rate": 0.0001, "loss": 3.9947, "loss/crossentropy": 2.028487980365753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19470887631177902, "step": 24730 }, { "epoch": 0.49464, "grad_norm": 1.75, "grad_norm_var": 0.0082672119140625, "learning_rate": 0.0001, "loss": 4.2435, "loss/crossentropy": 2.413878560066223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20918180793523788, "step": 24732 }, { "epoch": 0.49468, "grad_norm": 1.9375, "grad_norm_var": 0.007478841145833333, "learning_rate": 0.0001, "loss": 4.0814, "loss/crossentropy": 2.2201125621795654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19350255280733109, "step": 24734 }, { "epoch": 0.49472, "grad_norm": 1.9453125, "grad_norm_var": 0.006859334309895834, "learning_rate": 0.0001, "loss": 4.0363, "loss/crossentropy": 2.4426982402801514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20905796438455582, "step": 24736 }, { "epoch": 0.49476, "grad_norm": 1.9296875, "grad_norm_var": 0.0044667561848958336, "learning_rate": 0.0001, "loss": 3.811, "loss/crossentropy": 1.586302399635315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1673523262143135, "step": 24738 }, { "epoch": 0.4948, "grad_norm": 2.203125, "grad_norm_var": 0.009822591145833334, "learning_rate": 0.0001, "loss": 3.8759, "loss/crossentropy": 2.0435025691986084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17974509298801422, "step": 24740 }, { "epoch": 0.49484, "grad_norm": 2.015625, "grad_norm_var": 0.009916178385416667, "learning_rate": 0.0001, "loss": 4.288, "loss/crossentropy": 2.2075798511505127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002267688512802, "step": 24742 }, { "epoch": 0.49488, "grad_norm": 1.84375, "grad_norm_var": 0.011563873291015625, "learning_rate": 0.0001, "loss": 3.6645, "loss/crossentropy": 1.8993717432022095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17360415309667587, "step": 24744 }, { "epoch": 0.49492, "grad_norm": 1.7734375, "grad_norm_var": 0.013024648030598959, "learning_rate": 0.0001, "loss": 3.7726, "loss/crossentropy": 1.8296250700950623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821255162358284, "step": 24746 }, { "epoch": 0.49496, "grad_norm": 2.125, "grad_norm_var": 0.013911692301432292, "learning_rate": 0.0001, "loss": 3.9835, "loss/crossentropy": 1.8268097043037415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1724962294101715, "step": 24748 }, { "epoch": 0.495, "grad_norm": 1.9609375, "grad_norm_var": 0.014411417643229167, "learning_rate": 0.0001, "loss": 3.9835, "loss/crossentropy": 1.9587016701698303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193455271422863, "step": 24750 }, { "epoch": 0.49504, "grad_norm": 1.890625, "grad_norm_var": 0.014811197916666666, "learning_rate": 0.0001, "loss": 4.0863, "loss/crossentropy": 2.1292134523391724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20238783955574036, "step": 24752 }, { "epoch": 0.49508, "grad_norm": 2.03125, "grad_norm_var": 0.01566136678059896, "learning_rate": 0.0001, "loss": 3.9682, "loss/crossentropy": 2.0145905017852783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2276977151632309, "step": 24754 }, { "epoch": 0.49512, "grad_norm": 2.015625, "grad_norm_var": 0.011028798421223958, "learning_rate": 0.0001, "loss": 4.0566, "loss/crossentropy": 1.9815504550933838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1814296990633011, "step": 24756 }, { "epoch": 0.49516, "grad_norm": 1.7734375, "grad_norm_var": 0.012532552083333334, "learning_rate": 0.0001, "loss": 3.7894, "loss/crossentropy": 1.8275295495986938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16582081466913223, "step": 24758 }, { "epoch": 0.4952, "grad_norm": 1.90625, "grad_norm_var": 0.009859212239583333, "learning_rate": 0.0001, "loss": 3.9488, "loss/crossentropy": 1.9509209990501404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19137832522392273, "step": 24760 }, { "epoch": 0.49524, "grad_norm": 2.09375, "grad_norm_var": 0.007433827718098958, "learning_rate": 0.0001, "loss": 4.405, "loss/crossentropy": 2.255669593811035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20520294457674026, "step": 24762 }, { "epoch": 0.49528, "grad_norm": 1.9609375, "grad_norm_var": 0.006624348958333333, "learning_rate": 0.0001, "loss": 3.9608, "loss/crossentropy": 1.902245819568634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1721034198999405, "step": 24764 }, { "epoch": 0.49532, "grad_norm": 1.9453125, "grad_norm_var": 0.007085927327473958, "learning_rate": 0.0001, "loss": 4.033, "loss/crossentropy": 2.0003857016563416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1873132660984993, "step": 24766 }, { "epoch": 0.49536, "grad_norm": 2.1875, "grad_norm_var": 0.01024169921875, "learning_rate": 0.0001, "loss": 4.1782, "loss/crossentropy": 2.3280882835388184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21413114666938782, "step": 24768 }, { "epoch": 0.4954, "grad_norm": 1.9453125, "grad_norm_var": 0.008882649739583333, "learning_rate": 0.0001, "loss": 4.0517, "loss/crossentropy": 2.0396437644958496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18958552181720734, "step": 24770 }, { "epoch": 0.49544, "grad_norm": 1.8828125, "grad_norm_var": 0.009012603759765625, "learning_rate": 0.0001, "loss": 3.9254, "loss/crossentropy": 2.247615098953247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21501318365335464, "step": 24772 }, { "epoch": 0.49548, "grad_norm": 1.953125, "grad_norm_var": 0.008479817708333334, "learning_rate": 0.0001, "loss": 3.8623, "loss/crossentropy": 2.0479390621185303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20071326941251755, "step": 24774 }, { "epoch": 0.49552, "grad_norm": 1.9609375, "grad_norm_var": 0.008591461181640624, "learning_rate": 0.0001, "loss": 4.2585, "loss/crossentropy": 2.129696846008301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128293514251709, "step": 24776 }, { "epoch": 0.49556, "grad_norm": 2.046875, "grad_norm_var": 0.008805338541666667, "learning_rate": 0.0001, "loss": 4.1479, "loss/crossentropy": 2.0912004709243774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24846868962049484, "step": 24778 }, { "epoch": 0.4956, "grad_norm": 1.9296875, "grad_norm_var": 0.008934529622395833, "learning_rate": 0.0001, "loss": 4.1595, "loss/crossentropy": 2.310052990913391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025884911417961, "step": 24780 }, { "epoch": 0.49564, "grad_norm": 1.8671875, "grad_norm_var": 0.010886383056640626, "learning_rate": 0.0001, "loss": 4.2532, "loss/crossentropy": 2.3318560123443604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21703048795461655, "step": 24782 }, { "epoch": 0.49568, "grad_norm": 2.0, "grad_norm_var": 0.009791819254557292, "learning_rate": 0.0001, "loss": 3.7083, "loss/crossentropy": 1.9909458756446838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18664265424013138, "step": 24784 }, { "epoch": 0.49572, "grad_norm": 1.9296875, "grad_norm_var": 0.009814198811848958, "learning_rate": 0.0001, "loss": 4.34, "loss/crossentropy": 2.530202269554138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20007390528917313, "step": 24786 }, { "epoch": 0.49576, "grad_norm": 1.953125, "grad_norm_var": 0.009382120768229167, "learning_rate": 0.0001, "loss": 4.1865, "loss/crossentropy": 2.286345958709717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20716369152069092, "step": 24788 }, { "epoch": 0.4958, "grad_norm": 1.8671875, "grad_norm_var": 0.008768717447916666, "learning_rate": 0.0001, "loss": 4.06, "loss/crossentropy": 2.105964183807373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934148371219635, "step": 24790 }, { "epoch": 0.49584, "grad_norm": 1.8828125, "grad_norm_var": 0.009618123372395834, "learning_rate": 0.0001, "loss": 3.8057, "loss/crossentropy": 1.8528069853782654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16931568086147308, "step": 24792 }, { "epoch": 0.49588, "grad_norm": 1.8828125, "grad_norm_var": 0.008699544270833333, "learning_rate": 0.0001, "loss": 3.9729, "loss/crossentropy": 1.8847370743751526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16615931689739227, "step": 24794 }, { "epoch": 0.49592, "grad_norm": 2.046875, "grad_norm_var": 0.009040323893229167, "learning_rate": 0.0001, "loss": 4.2483, "loss/crossentropy": 2.159093201160431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20933274924755096, "step": 24796 }, { "epoch": 0.49596, "grad_norm": 2.109375, "grad_norm_var": 0.007995351155598959, "learning_rate": 0.0001, "loss": 4.2635, "loss/crossentropy": 2.1289132833480835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19792458415031433, "step": 24798 }, { "epoch": 0.496, "grad_norm": 1.796875, "grad_norm_var": 0.0071523030598958336, "learning_rate": 0.0001, "loss": 4.0548, "loss/crossentropy": 2.0613357424736023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18525776267051697, "step": 24800 }, { "epoch": 0.49604, "grad_norm": 1.96875, "grad_norm_var": 0.0070953369140625, "learning_rate": 0.0001, "loss": 4.0807, "loss/crossentropy": 1.7627623081207275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16891766339540482, "step": 24802 }, { "epoch": 0.49608, "grad_norm": 1.90625, "grad_norm_var": 0.008446248372395833, "learning_rate": 0.0001, "loss": 4.0642, "loss/crossentropy": 1.77890944480896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24872711300849915, "step": 24804 }, { "epoch": 0.49612, "grad_norm": 1.953125, "grad_norm_var": 0.009291330973307291, "learning_rate": 0.0001, "loss": 4.0468, "loss/crossentropy": 2.240827202796936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20014363527297974, "step": 24806 }, { "epoch": 0.49616, "grad_norm": 1.90625, "grad_norm_var": 0.009968058268229166, "learning_rate": 0.0001, "loss": 3.8452, "loss/crossentropy": 1.7963212132453918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16260015219449997, "step": 24808 }, { "epoch": 0.4962, "grad_norm": 2.125, "grad_norm_var": 0.011681874593098959, "learning_rate": 0.0001, "loss": 4.0605, "loss/crossentropy": 2.1720168590545654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18297959119081497, "step": 24810 }, { "epoch": 0.49624, "grad_norm": 1.9921875, "grad_norm_var": 0.010849761962890624, "learning_rate": 0.0001, "loss": 3.8772, "loss/crossentropy": 2.0715248584747314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20910069346427917, "step": 24812 }, { "epoch": 0.49628, "grad_norm": 2.0, "grad_norm_var": 0.009203084309895833, "learning_rate": 0.0001, "loss": 3.8442, "loss/crossentropy": 1.8676977157592773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19553124904632568, "step": 24814 }, { "epoch": 0.49632, "grad_norm": 1.8203125, "grad_norm_var": 0.008809407552083334, "learning_rate": 0.0001, "loss": 4.1918, "loss/crossentropy": 2.383392572402954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20431752502918243, "step": 24816 }, { "epoch": 0.49636, "grad_norm": 1.921875, "grad_norm_var": 0.013744099934895834, "learning_rate": 0.0001, "loss": 4.1956, "loss/crossentropy": 1.8039666414260864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17365024238824844, "step": 24818 }, { "epoch": 0.4964, "grad_norm": 1.8515625, "grad_norm_var": 0.012967936197916667, "learning_rate": 0.0001, "loss": 3.7321, "loss/crossentropy": 1.803081214427948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188148595392704, "step": 24820 }, { "epoch": 0.49644, "grad_norm": 1.7421875, "grad_norm_var": 0.014137522379557291, "learning_rate": 0.0001, "loss": 3.8016, "loss/crossentropy": 1.9158729910850525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17063472419977188, "step": 24822 }, { "epoch": 0.49648, "grad_norm": 1.9921875, "grad_norm_var": 0.01593805948893229, "learning_rate": 0.0001, "loss": 3.8044, "loss/crossentropy": 2.013060450553894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192637138068676, "step": 24824 }, { "epoch": 0.49652, "grad_norm": 1.9609375, "grad_norm_var": 0.015126291910807292, "learning_rate": 0.0001, "loss": 3.8618, "loss/crossentropy": 2.2182289361953735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003488838672638, "step": 24826 }, { "epoch": 0.49656, "grad_norm": 1.8125, "grad_norm_var": 0.0152008056640625, "learning_rate": 0.0001, "loss": 3.7418, "loss/crossentropy": 1.9512917399406433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17393754422664642, "step": 24828 }, { "epoch": 0.4966, "grad_norm": 2.140625, "grad_norm_var": 0.017986806233723958, "learning_rate": 0.0001, "loss": 4.2311, "loss/crossentropy": 2.3564621210098267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22488048672676086, "step": 24830 }, { "epoch": 0.49664, "grad_norm": 1.9765625, "grad_norm_var": 0.0183502197265625, "learning_rate": 0.0001, "loss": 4.0941, "loss/crossentropy": 2.3347015380859375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045806273818016, "step": 24832 }, { "epoch": 0.49668, "grad_norm": 1.921875, "grad_norm_var": 0.0120513916015625, "learning_rate": 0.0001, "loss": 4.0312, "loss/crossentropy": 1.8157051801681519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16906943917274475, "step": 24834 }, { "epoch": 0.49672, "grad_norm": 1.9140625, "grad_norm_var": 0.011903635660807292, "learning_rate": 0.0001, "loss": 4.1453, "loss/crossentropy": 2.068196475505829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20066522806882858, "step": 24836 }, { "epoch": 0.49676, "grad_norm": 1.875, "grad_norm_var": 0.010198720296223958, "learning_rate": 0.0001, "loss": 3.9633, "loss/crossentropy": 1.771648347377777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15898138284683228, "step": 24838 }, { "epoch": 0.4968, "grad_norm": 2.03125, "grad_norm_var": 0.008267974853515625, "learning_rate": 0.0001, "loss": 3.8167, "loss/crossentropy": 2.206678628921509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22059056907892227, "step": 24840 }, { "epoch": 0.49684, "grad_norm": 2.046875, "grad_norm_var": 0.006502278645833333, "learning_rate": 0.0001, "loss": 4.0938, "loss/crossentropy": 2.3368433713912964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20108889043331146, "step": 24842 }, { "epoch": 0.49688, "grad_norm": 1.890625, "grad_norm_var": 0.007039388020833333, "learning_rate": 0.0001, "loss": 3.6263, "loss/crossentropy": 2.1412705183029175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19969124346971512, "step": 24844 }, { "epoch": 0.49692, "grad_norm": 2.03125, "grad_norm_var": 0.0048215230305989586, "learning_rate": 0.0001, "loss": 4.0444, "loss/crossentropy": 1.9018915295600891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17174704372882843, "step": 24846 }, { "epoch": 0.49696, "grad_norm": 1.84375, "grad_norm_var": 0.0062652587890625, "learning_rate": 0.0001, "loss": 4.3229, "loss/crossentropy": 2.3042339086532593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20182086527347565, "step": 24848 }, { "epoch": 0.497, "grad_norm": 1.9765625, "grad_norm_var": 0.009948476155598959, "learning_rate": 0.0001, "loss": 3.899, "loss/crossentropy": 2.0412787199020386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19640880823135376, "step": 24850 }, { "epoch": 0.49704, "grad_norm": 1.9296875, "grad_norm_var": 0.010065714518229166, "learning_rate": 0.0001, "loss": 3.76, "loss/crossentropy": 1.9754884243011475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17998956143856049, "step": 24852 }, { "epoch": 0.49708, "grad_norm": 2.015625, "grad_norm_var": 0.010038248697916667, "learning_rate": 0.0001, "loss": 4.1763, "loss/crossentropy": 2.060430645942688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21152494102716446, "step": 24854 }, { "epoch": 0.49712, "grad_norm": 1.8046875, "grad_norm_var": 0.010375722249348959, "learning_rate": 0.0001, "loss": 3.7536, "loss/crossentropy": 1.6876602172851562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1664302945137024, "step": 24856 }, { "epoch": 0.49716, "grad_norm": 1.984375, "grad_norm_var": 0.009663645426432292, "learning_rate": 0.0001, "loss": 3.9341, "loss/crossentropy": 1.9389417171478271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19122107326984406, "step": 24858 }, { "epoch": 0.4972, "grad_norm": 1.859375, "grad_norm_var": 0.010752105712890625, "learning_rate": 0.0001, "loss": 3.905, "loss/crossentropy": 2.069887936115265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20993375778198242, "step": 24860 }, { "epoch": 0.49724, "grad_norm": 1.953125, "grad_norm_var": 0.0125640869140625, "learning_rate": 0.0001, "loss": 4.2935, "loss/crossentropy": 2.1173367500305176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19083526730537415, "step": 24862 }, { "epoch": 0.49728, "grad_norm": 1.9765625, "grad_norm_var": 0.011138661702473959, "learning_rate": 0.0001, "loss": 4.2694, "loss/crossentropy": 2.0459975004196167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20265580713748932, "step": 24864 }, { "epoch": 0.49732, "grad_norm": 2.0625, "grad_norm_var": 0.009471638997395834, "learning_rate": 0.0001, "loss": 3.9095, "loss/crossentropy": 1.9144663214683533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18860550224781036, "step": 24866 }, { "epoch": 0.49736, "grad_norm": 1.8359375, "grad_norm_var": 0.010762278238932292, "learning_rate": 0.0001, "loss": 3.8107, "loss/crossentropy": 1.6619080901145935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17097505927085876, "step": 24868 }, { "epoch": 0.4974, "grad_norm": 2.1875, "grad_norm_var": 0.014192708333333333, "learning_rate": 0.0001, "loss": 4.2346, "loss/crossentropy": 2.031949281692505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21831347048282623, "step": 24870 }, { "epoch": 0.49744, "grad_norm": 2.109375, "grad_norm_var": 0.014869944254557291, "learning_rate": 0.0001, "loss": 4.098, "loss/crossentropy": 2.330108165740967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21315570175647736, "step": 24872 }, { "epoch": 0.49748, "grad_norm": 1.859375, "grad_norm_var": 0.014876302083333333, "learning_rate": 0.0001, "loss": 3.8719, "loss/crossentropy": 2.301952600479126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18241943418979645, "step": 24874 }, { "epoch": 0.49752, "grad_norm": 1.8828125, "grad_norm_var": 0.013634999593098959, "learning_rate": 0.0001, "loss": 4.3927, "loss/crossentropy": 2.3587260246276855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19822461903095245, "step": 24876 }, { "epoch": 0.49756, "grad_norm": 1.84375, "grad_norm_var": 0.012729644775390625, "learning_rate": 0.0001, "loss": 3.7801, "loss/crossentropy": 1.8675512671470642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19025489687919617, "step": 24878 }, { "epoch": 0.4976, "grad_norm": 2.390625, "grad_norm_var": 0.027367146809895833, "learning_rate": 0.0001, "loss": 3.8605, "loss/crossentropy": 1.9007975459098816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18459592014551163, "step": 24880 }, { "epoch": 0.49764, "grad_norm": 2.03125, "grad_norm_var": 0.026009114583333333, "learning_rate": 0.0001, "loss": 4.1207, "loss/crossentropy": 1.9351030588150024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17839118093252182, "step": 24882 }, { "epoch": 0.49768, "grad_norm": 1.890625, "grad_norm_var": 0.02781956990559896, "learning_rate": 0.0001, "loss": 4.2804, "loss/crossentropy": 2.2661246061325073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19518450647592545, "step": 24884 }, { "epoch": 0.49772, "grad_norm": 2.0625, "grad_norm_var": 0.025333658854166666, "learning_rate": 0.0001, "loss": 4.2568, "loss/crossentropy": 2.1555867791175842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20517798513174057, "step": 24886 }, { "epoch": 0.49776, "grad_norm": 1.953125, "grad_norm_var": 0.023067220052083334, "learning_rate": 0.0001, "loss": 4.0291, "loss/crossentropy": 2.1638144850730896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18970053642988205, "step": 24888 }, { "epoch": 0.4978, "grad_norm": 2.015625, "grad_norm_var": 0.023280588785807292, "learning_rate": 0.0001, "loss": 3.9658, "loss/crossentropy": 2.230544328689575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20187361538410187, "step": 24890 }, { "epoch": 0.49784, "grad_norm": 1.9296875, "grad_norm_var": 0.02308349609375, "learning_rate": 0.0001, "loss": 4.1406, "loss/crossentropy": 2.1005473136901855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20453108847141266, "step": 24892 }, { "epoch": 0.49788, "grad_norm": 1.953125, "grad_norm_var": 0.0233642578125, "learning_rate": 0.0001, "loss": 3.898, "loss/crossentropy": 2.056541621685028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18511568009853363, "step": 24894 }, { "epoch": 0.49792, "grad_norm": 1.8984375, "grad_norm_var": 0.011407216389973959, "learning_rate": 0.0001, "loss": 4.2019, "loss/crossentropy": 2.151825726032257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19390379637479782, "step": 24896 }, { "epoch": 0.49796, "grad_norm": 2.015625, "grad_norm_var": 0.011726633707682291, "learning_rate": 0.0001, "loss": 4.159, "loss/crossentropy": 2.2062729597091675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959953010082245, "step": 24898 }, { "epoch": 0.498, "grad_norm": 1.8828125, "grad_norm_var": 0.007835896809895833, "learning_rate": 0.0001, "loss": 4.0506, "loss/crossentropy": 2.11991286277771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20495782047510147, "step": 24900 }, { "epoch": 0.49804, "grad_norm": 1.8515625, "grad_norm_var": 0.007785797119140625, "learning_rate": 0.0001, "loss": 3.8555, "loss/crossentropy": 1.9586367011070251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18644800782203674, "step": 24902 }, { "epoch": 0.49808, "grad_norm": 1.8515625, "grad_norm_var": 0.008665974934895833, "learning_rate": 0.0001, "loss": 3.987, "loss/crossentropy": 2.030204951763153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938316822052002, "step": 24904 }, { "epoch": 0.49812, "grad_norm": 1.953125, "grad_norm_var": 0.00999755859375, "learning_rate": 0.0001, "loss": 4.2644, "loss/crossentropy": 2.063572645187378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19029874354600906, "step": 24906 }, { "epoch": 0.49816, "grad_norm": 1.921875, "grad_norm_var": 0.010081990559895834, "learning_rate": 0.0001, "loss": 3.8608, "loss/crossentropy": 1.8356643319129944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18317610025405884, "step": 24908 }, { "epoch": 0.4982, "grad_norm": 2.0625, "grad_norm_var": 0.08252665201822916, "learning_rate": 0.0001, "loss": 4.1617, "loss/crossentropy": 1.8856282830238342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1754850372672081, "step": 24910 }, { "epoch": 0.49824, "grad_norm": 1.9296875, "grad_norm_var": 0.08241780598958333, "learning_rate": 0.0001, "loss": 3.9649, "loss/crossentropy": 1.8674440383911133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805490255355835, "step": 24912 }, { "epoch": 0.49828, "grad_norm": 1.84375, "grad_norm_var": 0.08478978474934896, "learning_rate": 0.0001, "loss": 3.839, "loss/crossentropy": 1.8774593472480774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18495067209005356, "step": 24914 }, { "epoch": 0.49832, "grad_norm": 1.71875, "grad_norm_var": 0.08945210774739583, "learning_rate": 0.0001, "loss": 3.9121, "loss/crossentropy": 2.059852659702301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19449050724506378, "step": 24916 }, { "epoch": 0.49836, "grad_norm": 1.9765625, "grad_norm_var": 0.08824869791666666, "learning_rate": 0.0001, "loss": 4.2468, "loss/crossentropy": 2.3840794563293457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23490352928638458, "step": 24918 }, { "epoch": 0.4984, "grad_norm": 1.8671875, "grad_norm_var": 0.08859049479166667, "learning_rate": 0.0001, "loss": 4.1537, "loss/crossentropy": 2.136437773704529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19557444751262665, "step": 24920 }, { "epoch": 0.49844, "grad_norm": 2.140625, "grad_norm_var": 0.08821614583333333, "learning_rate": 0.0001, "loss": 4.3658, "loss/crossentropy": 2.4357622861862183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21140636503696442, "step": 24922 }, { "epoch": 0.49848, "grad_norm": 1.859375, "grad_norm_var": 0.08875223795572916, "learning_rate": 0.0001, "loss": 3.9085, "loss/crossentropy": 2.1498345136642456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18728043884038925, "step": 24924 }, { "epoch": 0.49852, "grad_norm": 1.890625, "grad_norm_var": 0.012580362955729167, "learning_rate": 0.0001, "loss": 4.0771, "loss/crossentropy": 2.131583571434021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21287915110588074, "step": 24926 }, { "epoch": 0.49856, "grad_norm": 1.9375, "grad_norm_var": 0.012505849202473959, "learning_rate": 0.0001, "loss": 4.1492, "loss/crossentropy": 2.434916377067566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438215672969818, "step": 24928 }, { "epoch": 0.4986, "grad_norm": 1.6875, "grad_norm_var": 0.016397857666015626, "learning_rate": 0.0001, "loss": 3.8951, "loss/crossentropy": 1.9909663796424866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16714239120483398, "step": 24930 }, { "epoch": 0.49864, "grad_norm": 1.859375, "grad_norm_var": 0.01343994140625, "learning_rate": 0.0001, "loss": 4.0229, "loss/crossentropy": 1.9362955689430237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18765835464000702, "step": 24932 }, { "epoch": 0.49868, "grad_norm": 1.8828125, "grad_norm_var": 0.011527252197265626, "learning_rate": 0.0001, "loss": 3.888, "loss/crossentropy": 1.909186065196991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19517959654331207, "step": 24934 }, { "epoch": 0.49872, "grad_norm": 1.7890625, "grad_norm_var": 0.010350545247395834, "learning_rate": 0.0001, "loss": 4.0523, "loss/crossentropy": 1.9746766686439514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19376099109649658, "step": 24936 }, { "epoch": 0.49876, "grad_norm": 2.0, "grad_norm_var": 0.006844075520833334, "learning_rate": 0.0001, "loss": 4.0004, "loss/crossentropy": 2.0698294639587402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993079036474228, "step": 24938 }, { "epoch": 0.4988, "grad_norm": 1.921875, "grad_norm_var": 0.007453409830729166, "learning_rate": 0.0001, "loss": 4.1234, "loss/crossentropy": 1.8996255993843079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822480708360672, "step": 24940 }, { "epoch": 0.49884, "grad_norm": 2.125, "grad_norm_var": 0.0114898681640625, "learning_rate": 0.0001, "loss": 4.4206, "loss/crossentropy": 1.9174728989601135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18220385164022446, "step": 24942 }, { "epoch": 0.49888, "grad_norm": 1.8203125, "grad_norm_var": 0.0126953125, "learning_rate": 0.0001, "loss": 3.6162, "loss/crossentropy": 1.827277660369873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17845244705677032, "step": 24944 }, { "epoch": 0.49892, "grad_norm": 1.9921875, "grad_norm_var": 0.009264882405598958, "learning_rate": 0.0001, "loss": 4.1165, "loss/crossentropy": 2.0837597846984863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19145962595939636, "step": 24946 }, { "epoch": 0.49896, "grad_norm": 1.90625, "grad_norm_var": 0.008915201822916666, "learning_rate": 0.0001, "loss": 3.969, "loss/crossentropy": 2.417944073677063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22484175115823746, "step": 24948 }, { "epoch": 0.499, "grad_norm": 2.03125, "grad_norm_var": 0.009639485677083334, "learning_rate": 0.0001, "loss": 4.0408, "loss/crossentropy": 2.338744640350342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132393717765808, "step": 24950 }, { "epoch": 0.49904, "grad_norm": 2.09375, "grad_norm_var": 0.009559885660807291, "learning_rate": 0.0001, "loss": 3.9549, "loss/crossentropy": 2.1383684277534485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19005178660154343, "step": 24952 }, { "epoch": 0.49908, "grad_norm": 1.859375, "grad_norm_var": 0.010731760660807292, "learning_rate": 0.0001, "loss": 4.1188, "loss/crossentropy": 1.8223644495010376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17134226113557816, "step": 24954 }, { "epoch": 0.49912, "grad_norm": 2.265625, "grad_norm_var": 0.017228190104166666, "learning_rate": 0.0001, "loss": 3.9886, "loss/crossentropy": 2.09305202960968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19109848141670227, "step": 24956 }, { "epoch": 0.49916, "grad_norm": 2.03125, "grad_norm_var": 0.017844390869140626, "learning_rate": 0.0001, "loss": 3.7533, "loss/crossentropy": 1.9903115034103394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18018385022878647, "step": 24958 }, { "epoch": 0.4992, "grad_norm": 2.0, "grad_norm_var": 0.02005182902018229, "learning_rate": 0.0001, "loss": 4.0391, "loss/crossentropy": 1.9048819541931152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18057531118392944, "step": 24960 }, { "epoch": 0.49924, "grad_norm": 1.9609375, "grad_norm_var": 0.019505818684895832, "learning_rate": 0.0001, "loss": 4.0654, "loss/crossentropy": 2.045142412185669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18374620378017426, "step": 24962 }, { "epoch": 0.49928, "grad_norm": 1.9140625, "grad_norm_var": 0.019798787434895833, "learning_rate": 0.0001, "loss": 4.0365, "loss/crossentropy": 2.094703733921051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21248038858175278, "step": 24964 }, { "epoch": 0.49932, "grad_norm": 1.8671875, "grad_norm_var": 0.020359039306640625, "learning_rate": 0.0001, "loss": 3.7111, "loss/crossentropy": 2.113132894039154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19428254663944244, "step": 24966 }, { "epoch": 0.49936, "grad_norm": 2.015625, "grad_norm_var": 0.019123331705729166, "learning_rate": 0.0001, "loss": 4.2238, "loss/crossentropy": 2.0182060599327087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22048161923885345, "step": 24968 }, { "epoch": 0.4994, "grad_norm": 1.84375, "grad_norm_var": 0.01933568318684896, "learning_rate": 0.0001, "loss": 4.0755, "loss/crossentropy": 2.141752541065216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198372982442379, "step": 24970 }, { "epoch": 0.49944, "grad_norm": 2.03125, "grad_norm_var": 0.012737782796223958, "learning_rate": 0.0001, "loss": 4.0035, "loss/crossentropy": 1.9773234724998474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18223125487565994, "step": 24972 }, { "epoch": 0.49948, "grad_norm": 2.015625, "grad_norm_var": 0.009464518229166666, "learning_rate": 0.0001, "loss": 3.9405, "loss/crossentropy": 1.7909184098243713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17887402325868607, "step": 24974 }, { "epoch": 0.49952, "grad_norm": 1.921875, "grad_norm_var": 0.0048411051432291664, "learning_rate": 0.0001, "loss": 4.0024, "loss/crossentropy": 1.9185696840286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19022425264120102, "step": 24976 }, { "epoch": 0.49956, "grad_norm": 2.03125, "grad_norm_var": 0.005472819010416667, "learning_rate": 0.0001, "loss": 4.2593, "loss/crossentropy": 1.7053416967391968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18504460155963898, "step": 24978 }, { "epoch": 0.4996, "grad_norm": 1.8828125, "grad_norm_var": 0.006156158447265625, "learning_rate": 0.0001, "loss": 4.2626, "loss/crossentropy": 2.1260892152786255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958574280142784, "step": 24980 }, { "epoch": 0.49964, "grad_norm": 1.9296875, "grad_norm_var": 0.005537923177083333, "learning_rate": 0.0001, "loss": 3.9854, "loss/crossentropy": 1.9965519905090332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20814893394708633, "step": 24982 }, { "epoch": 0.49968, "grad_norm": 1.8671875, "grad_norm_var": 0.3696734110514323, "learning_rate": 0.0001, "loss": 4.1443, "loss/crossentropy": 1.882089376449585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23063261061906815, "step": 24984 }, { "epoch": 0.49972, "grad_norm": 2.03125, "grad_norm_var": 0.36444498697916666, "learning_rate": 0.0001, "loss": 3.9484, "loss/crossentropy": 2.0816534757614136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1818612888455391, "step": 24986 }, { "epoch": 0.49976, "grad_norm": 1.875, "grad_norm_var": 0.3668365478515625, "learning_rate": 0.0001, "loss": 3.794, "loss/crossentropy": 1.7514638304710388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18518543243408203, "step": 24988 }, { "epoch": 0.4998, "grad_norm": 1.96875, "grad_norm_var": 0.37027969360351565, "learning_rate": 0.0001, "loss": 4.095, "loss/crossentropy": 2.438727021217346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042185142636299, "step": 24990 }, { "epoch": 0.49984, "grad_norm": 2.1875, "grad_norm_var": 0.3688087463378906, "learning_rate": 0.0001, "loss": 4.1072, "loss/crossentropy": 2.0413439869880676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19952382892370224, "step": 24992 }, { "epoch": 0.49988, "grad_norm": 1.859375, "grad_norm_var": 0.3713612874348958, "learning_rate": 0.0001, "loss": 4.1332, "loss/crossentropy": 2.1857372522354126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18842580169439316, "step": 24994 }, { "epoch": 0.49992, "grad_norm": 1.96875, "grad_norm_var": 0.36927261352539065, "learning_rate": 0.0001, "loss": 4.1855, "loss/crossentropy": 1.9340556859970093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24346761405467987, "step": 24996 }, { "epoch": 0.49996, "grad_norm": 1.859375, "grad_norm_var": 0.3735389709472656, "learning_rate": 0.0001, "loss": 3.9177, "loss/crossentropy": 1.9736055731773376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20419831573963165, "step": 24998 }, { "epoch": 0.5, "grad_norm": 1.96875, "grad_norm_var": 0.0122314453125, "learning_rate": 0.0001, "loss": 4.0167, "loss/crossentropy": 1.8731598258018494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20324288308620453, "step": 25000 }, { "epoch": 0.50004, "grad_norm": 2.0, "grad_norm_var": 0.012645467122395834, "learning_rate": 0.0001, "loss": 4.1379, "loss/crossentropy": 1.9114505648612976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19572290033102036, "step": 25002 }, { "epoch": 0.50008, "grad_norm": 2.1875, "grad_norm_var": 0.016022745768229166, "learning_rate": 0.0001, "loss": 4.0389, "loss/crossentropy": 2.0608550310134888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20759478956460953, "step": 25004 }, { "epoch": 0.50012, "grad_norm": 1.8828125, "grad_norm_var": 0.015529123942057292, "learning_rate": 0.0001, "loss": 4.0434, "loss/crossentropy": 2.0759811401367188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1895374357700348, "step": 25006 }, { "epoch": 0.50016, "grad_norm": 1.8203125, "grad_norm_var": 0.012859853108723958, "learning_rate": 0.0001, "loss": 3.7772, "loss/crossentropy": 2.139121353626251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19550149887800217, "step": 25008 }, { "epoch": 0.5002, "grad_norm": 1.7890625, "grad_norm_var": 0.013410441080729167, "learning_rate": 0.0001, "loss": 3.9163, "loss/crossentropy": 1.8311315774917603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17756005376577377, "step": 25010 }, { "epoch": 0.50024, "grad_norm": 2.03125, "grad_norm_var": 0.012311808268229167, "learning_rate": 0.0001, "loss": 3.9897, "loss/crossentropy": 2.0824968814849854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21674959361553192, "step": 25012 }, { "epoch": 0.50028, "grad_norm": 2.0, "grad_norm_var": 0.018317667643229167, "learning_rate": 0.0001, "loss": 4.4713, "loss/crossentropy": 2.3479214906692505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22297757118940353, "step": 25014 }, { "epoch": 0.50032, "grad_norm": 2.03125, "grad_norm_var": 0.017682902018229165, "learning_rate": 0.0001, "loss": 4.2473, "loss/crossentropy": 2.477561354637146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2347298264503479, "step": 25016 }, { "epoch": 0.50036, "grad_norm": 1.875, "grad_norm_var": 0.020369466145833334, "learning_rate": 0.0001, "loss": 3.797, "loss/crossentropy": 1.7736787796020508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1656038835644722, "step": 25018 }, { "epoch": 0.5004, "grad_norm": 1.90625, "grad_norm_var": 0.017130533854166668, "learning_rate": 0.0001, "loss": 4.1537, "loss/crossentropy": 2.2568663358688354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19755709171295166, "step": 25020 }, { "epoch": 0.50044, "grad_norm": 2.078125, "grad_norm_var": 0.018668619791666667, "learning_rate": 0.0001, "loss": 3.9279, "loss/crossentropy": 1.961386263370514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079472467303276, "step": 25022 }, { "epoch": 0.50048, "grad_norm": 1.78125, "grad_norm_var": 0.01898981730143229, "learning_rate": 0.0001, "loss": 3.7884, "loss/crossentropy": 1.75395268201828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16586285829544067, "step": 25024 }, { "epoch": 0.50052, "grad_norm": 1.90625, "grad_norm_var": 0.0171142578125, "learning_rate": 0.0001, "loss": 3.9074, "loss/crossentropy": 2.1727951169013977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21714362502098083, "step": 25026 }, { "epoch": 0.50056, "grad_norm": 1.953125, "grad_norm_var": 0.017765299479166666, "learning_rate": 0.0001, "loss": 4.276, "loss/crossentropy": 2.147822380065918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19961202144622803, "step": 25028 }, { "epoch": 0.5006, "grad_norm": 2.078125, "grad_norm_var": 0.012775675455729166, "learning_rate": 0.0001, "loss": 4.1055, "loss/crossentropy": 2.0300870537757874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19732433557510376, "step": 25030 }, { "epoch": 0.50064, "grad_norm": 1.875, "grad_norm_var": 0.019828287760416667, "learning_rate": 0.0001, "loss": 4.0727, "loss/crossentropy": 1.840386688709259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18403710424900055, "step": 25032 }, { "epoch": 0.50068, "grad_norm": 1.9296875, "grad_norm_var": 0.01875, "learning_rate": 0.0001, "loss": 3.8442, "loss/crossentropy": 1.9718754291534424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19007521867752075, "step": 25034 }, { "epoch": 0.50072, "grad_norm": 1.8984375, "grad_norm_var": 0.022468058268229167, "learning_rate": 0.0001, "loss": 3.8137, "loss/crossentropy": 1.9550666809082031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944015771150589, "step": 25036 }, { "epoch": 0.50076, "grad_norm": 1.75, "grad_norm_var": 0.024397786458333334, "learning_rate": 0.0001, "loss": 3.7467, "loss/crossentropy": 1.9608187079429626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1774805188179016, "step": 25038 }, { "epoch": 0.5008, "grad_norm": 1.96875, "grad_norm_var": 0.0225494384765625, "learning_rate": 0.0001, "loss": 3.7607, "loss/crossentropy": 1.6980991959571838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16579563170671463, "step": 25040 }, { "epoch": 0.50084, "grad_norm": 1.890625, "grad_norm_var": 0.02269261678059896, "learning_rate": 0.0001, "loss": 3.9008, "loss/crossentropy": 1.8796940445899963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1857006549835205, "step": 25042 }, { "epoch": 0.50088, "grad_norm": 1.953125, "grad_norm_var": 0.02159398396809896, "learning_rate": 0.0001, "loss": 3.7862, "loss/crossentropy": 1.8667701482772827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1920718252658844, "step": 25044 }, { "epoch": 0.50092, "grad_norm": 1.9140625, "grad_norm_var": 0.019676717122395833, "learning_rate": 0.0001, "loss": 4.11, "loss/crossentropy": 2.142220616340637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20785054564476013, "step": 25046 }, { "epoch": 0.50096, "grad_norm": 1.953125, "grad_norm_var": 0.011185709635416667, "learning_rate": 0.0001, "loss": 4.1763, "loss/crossentropy": 2.1965246200561523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001591995358467, "step": 25048 }, { "epoch": 0.501, "grad_norm": 2.09375, "grad_norm_var": 0.01103515625, "learning_rate": 0.0001, "loss": 3.9254, "loss/crossentropy": 2.1812084913253784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21115607023239136, "step": 25050 }, { "epoch": 0.50104, "grad_norm": 1.921875, "grad_norm_var": 0.00670166015625, "learning_rate": 0.0001, "loss": 4.1107, "loss/crossentropy": 1.8218246698379517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20282388478517532, "step": 25052 }, { "epoch": 0.50108, "grad_norm": 2.015625, "grad_norm_var": 0.0037750244140625, "learning_rate": 0.0001, "loss": 3.9292, "loss/crossentropy": 2.2070316076278687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21164244413375854, "step": 25054 }, { "epoch": 0.50112, "grad_norm": 2.09375, "grad_norm_var": 0.00509033203125, "learning_rate": 0.0001, "loss": 4.0604, "loss/crossentropy": 2.1380701065063477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20642387121915817, "step": 25056 }, { "epoch": 0.50116, "grad_norm": 1.9765625, "grad_norm_var": 0.004671223958333333, "learning_rate": 0.0001, "loss": 4.188, "loss/crossentropy": 2.2019251585006714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20543986558914185, "step": 25058 }, { "epoch": 0.5012, "grad_norm": 1.9140625, "grad_norm_var": 0.006583658854166666, "learning_rate": 0.0001, "loss": 3.9042, "loss/crossentropy": 2.0563461780548096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18677344918251038, "step": 25060 }, { "epoch": 0.50124, "grad_norm": 1.9453125, "grad_norm_var": 0.0067291259765625, "learning_rate": 0.0001, "loss": 4.0295, "loss/crossentropy": 1.788195252418518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18303587287664413, "step": 25062 }, { "epoch": 0.50128, "grad_norm": 2.0, "grad_norm_var": 0.0079254150390625, "learning_rate": 0.0001, "loss": 3.7124, "loss/crossentropy": 1.888840913772583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1736098676919937, "step": 25064 }, { "epoch": 0.50132, "grad_norm": 1.9453125, "grad_norm_var": 0.006139882405598958, "learning_rate": 0.0001, "loss": 4.0185, "loss/crossentropy": 2.0963358283042908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944149062037468, "step": 25066 }, { "epoch": 0.50136, "grad_norm": 1.8984375, "grad_norm_var": 0.006322224934895833, "learning_rate": 0.0001, "loss": 4.0357, "loss/crossentropy": 2.3116443157196045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22342295944690704, "step": 25068 }, { "epoch": 0.5014, "grad_norm": 2.03125, "grad_norm_var": 0.006266276041666667, "learning_rate": 0.0001, "loss": 4.2353, "loss/crossentropy": 2.251197099685669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25324299186468124, "step": 25070 }, { "epoch": 0.50144, "grad_norm": 1.890625, "grad_norm_var": 0.004613240559895833, "learning_rate": 0.0001, "loss": 3.7142, "loss/crossentropy": 2.2114810943603516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18877273797988892, "step": 25072 }, { "epoch": 0.50148, "grad_norm": 1.9921875, "grad_norm_var": 0.004923248291015625, "learning_rate": 0.0001, "loss": 4.1174, "loss/crossentropy": 2.266746401786804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21522346138954163, "step": 25074 }, { "epoch": 0.50152, "grad_norm": 1.84375, "grad_norm_var": 0.004809315999348958, "learning_rate": 0.0001, "loss": 4.2369, "loss/crossentropy": 1.922059416770935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17906102538108826, "step": 25076 }, { "epoch": 0.50156, "grad_norm": 1.8671875, "grad_norm_var": 0.005891672770182292, "learning_rate": 0.0001, "loss": 4.0695, "loss/crossentropy": 2.077875077724457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18431179225444794, "step": 25078 }, { "epoch": 0.5016, "grad_norm": 1.96875, "grad_norm_var": 0.006135050455729167, "learning_rate": 0.0001, "loss": 3.7519, "loss/crossentropy": 1.8861806988716125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17609698325395584, "step": 25080 }, { "epoch": 0.50164, "grad_norm": 2.078125, "grad_norm_var": 0.008302561442057292, "learning_rate": 0.0001, "loss": 4.0371, "loss/crossentropy": 2.1382817029953003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21308781206607819, "step": 25082 }, { "epoch": 0.50168, "grad_norm": 2.015625, "grad_norm_var": 0.008656565348307292, "learning_rate": 0.0001, "loss": 3.9133, "loss/crossentropy": 1.9955537915229797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20031572878360748, "step": 25084 }, { "epoch": 0.50172, "grad_norm": 1.9140625, "grad_norm_var": 0.0108551025390625, "learning_rate": 0.0001, "loss": 4.0816, "loss/crossentropy": 2.2564213275909424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19504930824041367, "step": 25086 }, { "epoch": 0.50176, "grad_norm": 2.203125, "grad_norm_var": 0.01778132120768229, "learning_rate": 0.0001, "loss": 4.2432, "loss/crossentropy": 2.388614296913147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2021147906780243, "step": 25088 }, { "epoch": 0.5018, "grad_norm": 2.015625, "grad_norm_var": 0.016764322916666668, "learning_rate": 0.0001, "loss": 4.0045, "loss/crossentropy": 2.5410468578338623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23654652386903763, "step": 25090 }, { "epoch": 0.50184, "grad_norm": 1.90625, "grad_norm_var": 0.016255696614583332, "learning_rate": 0.0001, "loss": 4.0248, "loss/crossentropy": 1.9850627183914185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861669197678566, "step": 25092 }, { "epoch": 0.50188, "grad_norm": 1.9921875, "grad_norm_var": 0.01573486328125, "learning_rate": 0.0001, "loss": 4.2318, "loss/crossentropy": 2.3069719076156616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22062674164772034, "step": 25094 }, { "epoch": 0.50192, "grad_norm": 1.875, "grad_norm_var": 0.013960520426432291, "learning_rate": 0.0001, "loss": 3.9187, "loss/crossentropy": 1.7462975978851318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17929460108280182, "step": 25096 }, { "epoch": 0.50196, "grad_norm": 1.84375, "grad_norm_var": 0.012870025634765626, "learning_rate": 0.0001, "loss": 4.0647, "loss/crossentropy": 1.8277402520179749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17240776866674423, "step": 25098 }, { "epoch": 0.502, "grad_norm": 1.9296875, "grad_norm_var": 0.013978830973307292, "learning_rate": 0.0001, "loss": 4.0331, "loss/crossentropy": 2.2530910968780518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20934444665908813, "step": 25100 }, { "epoch": 0.50204, "grad_norm": 1.9375, "grad_norm_var": 0.0118316650390625, "learning_rate": 0.0001, "loss": 3.9368, "loss/crossentropy": 1.8995028138160706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18956587463617325, "step": 25102 }, { "epoch": 0.50208, "grad_norm": 1.8359375, "grad_norm_var": 0.0044553120930989586, "learning_rate": 0.0001, "loss": 3.8491, "loss/crossentropy": 1.9500588774681091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18196535110473633, "step": 25104 }, { "epoch": 0.50212, "grad_norm": 1.875, "grad_norm_var": 0.0037750244140625, "learning_rate": 0.0001, "loss": 3.8702, "loss/crossentropy": 1.794252336025238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17875805497169495, "step": 25106 }, { "epoch": 0.50216, "grad_norm": 1.9453125, "grad_norm_var": 0.004124959309895833, "learning_rate": 0.0001, "loss": 3.9752, "loss/crossentropy": 2.282618999481201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18928487598896027, "step": 25108 }, { "epoch": 0.5022, "grad_norm": 1.9296875, "grad_norm_var": 0.0036173502604166666, "learning_rate": 0.0001, "loss": 4.1439, "loss/crossentropy": 1.8668266534805298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1754981055855751, "step": 25110 }, { "epoch": 0.50224, "grad_norm": 1.9765625, "grad_norm_var": 0.0038859049479166665, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 2.416601300239563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21391690522432327, "step": 25112 }, { "epoch": 0.50228, "grad_norm": 1.921875, "grad_norm_var": 0.003311920166015625, "learning_rate": 0.0001, "loss": 3.914, "loss/crossentropy": 1.9013903141021729, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16768474876880646, "step": 25114 }, { "epoch": 0.50232, "grad_norm": 1.9453125, "grad_norm_var": 0.0035807291666666665, "learning_rate": 0.0001, "loss": 3.8587, "loss/crossentropy": 2.090600550174713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963217630982399, "step": 25116 }, { "epoch": 0.50236, "grad_norm": 1.9765625, "grad_norm_var": 0.003929646809895834, "learning_rate": 0.0001, "loss": 4.0404, "loss/crossentropy": 1.938132882118225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17899858951568604, "step": 25118 }, { "epoch": 0.5024, "grad_norm": 2.0, "grad_norm_var": 0.0031613667805989584, "learning_rate": 0.0001, "loss": 4.1818, "loss/crossentropy": 2.2119863033294678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20339767634868622, "step": 25120 }, { "epoch": 0.50244, "grad_norm": 2.0, "grad_norm_var": 0.0031809488932291667, "learning_rate": 0.0001, "loss": 4.2186, "loss/crossentropy": 2.35983407497406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063140720129013, "step": 25122 }, { "epoch": 0.50248, "grad_norm": 1.8515625, "grad_norm_var": 0.003830718994140625, "learning_rate": 0.0001, "loss": 4.2016, "loss/crossentropy": 2.0203242897987366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19055970013141632, "step": 25124 }, { "epoch": 0.50252, "grad_norm": 1.7890625, "grad_norm_var": 0.0050961812337239586, "learning_rate": 0.0001, "loss": 3.6522, "loss/crossentropy": 1.7058457732200623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1857994720339775, "step": 25126 }, { "epoch": 0.50256, "grad_norm": 1.828125, "grad_norm_var": 0.0054514567057291664, "learning_rate": 0.0001, "loss": 3.7193, "loss/crossentropy": 1.720770239830017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1738799437880516, "step": 25128 }, { "epoch": 0.5026, "grad_norm": 1.9765625, "grad_norm_var": 0.006247711181640625, "learning_rate": 0.0001, "loss": 3.8967, "loss/crossentropy": 1.677247405052185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18985359370708466, "step": 25130 }, { "epoch": 0.50264, "grad_norm": 1.9609375, "grad_norm_var": 0.0058977762858072914, "learning_rate": 0.0001, "loss": 4.0737, "loss/crossentropy": 2.1808619499206543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19232448935508728, "step": 25132 }, { "epoch": 0.50268, "grad_norm": 1.6953125, "grad_norm_var": 0.008512369791666667, "learning_rate": 0.0001, "loss": 3.8655, "loss/crossentropy": 1.8475935459136963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18105845898389816, "step": 25134 }, { "epoch": 0.50272, "grad_norm": 2.03125, "grad_norm_var": 0.012052154541015625, "learning_rate": 0.0001, "loss": 3.8568, "loss/crossentropy": 2.0687568187713623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997617706656456, "step": 25136 }, { "epoch": 0.50276, "grad_norm": 2.015625, "grad_norm_var": 0.01256103515625, "learning_rate": 0.0001, "loss": 4.29, "loss/crossentropy": 2.405083179473877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22054974734783173, "step": 25138 }, { "epoch": 0.5028, "grad_norm": 2.078125, "grad_norm_var": 0.014037068684895833, "learning_rate": 0.0001, "loss": 4.0613, "loss/crossentropy": 1.917975127696991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20000945031642914, "step": 25140 }, { "epoch": 0.50284, "grad_norm": 2.09375, "grad_norm_var": 0.017010243733723958, "learning_rate": 0.0001, "loss": 4.1259, "loss/crossentropy": 2.270500898361206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21464426815509796, "step": 25142 }, { "epoch": 0.50288, "grad_norm": 1.875, "grad_norm_var": 0.017145792643229168, "learning_rate": 0.0001, "loss": 4.0141, "loss/crossentropy": 2.0085031390190125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20128140598535538, "step": 25144 }, { "epoch": 0.50292, "grad_norm": 1.96875, "grad_norm_var": 0.0150299072265625, "learning_rate": 0.0001, "loss": 4.1969, "loss/crossentropy": 2.206787347793579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19804110378026962, "step": 25146 }, { "epoch": 0.50296, "grad_norm": 1.875, "grad_norm_var": 0.015428670247395833, "learning_rate": 0.0001, "loss": 3.8519, "loss/crossentropy": 1.7760102152824402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18732204288244247, "step": 25148 }, { "epoch": 0.503, "grad_norm": 2.046875, "grad_norm_var": 0.011439768473307292, "learning_rate": 0.0001, "loss": 4.1334, "loss/crossentropy": 1.9429696202278137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21402037143707275, "step": 25150 }, { "epoch": 0.50304, "grad_norm": 1.8203125, "grad_norm_var": 0.007315826416015625, "learning_rate": 0.0001, "loss": 3.8488, "loss/crossentropy": 1.6982505321502686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16420165449380875, "step": 25152 }, { "epoch": 0.50308, "grad_norm": 1.8984375, "grad_norm_var": 0.0076324462890625, "learning_rate": 0.0001, "loss": 4.0041, "loss/crossentropy": 1.998267948627472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193763867020607, "step": 25154 }, { "epoch": 0.50312, "grad_norm": 2.1875, "grad_norm_var": 0.009917958577473959, "learning_rate": 0.0001, "loss": 4.0845, "loss/crossentropy": 2.4127193689346313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2271365001797676, "step": 25156 }, { "epoch": 0.50316, "grad_norm": 1.859375, "grad_norm_var": 0.0094390869140625, "learning_rate": 0.0001, "loss": 3.8656, "loss/crossentropy": 1.5740959644317627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17892491817474365, "step": 25158 }, { "epoch": 0.5032, "grad_norm": 2.03125, "grad_norm_var": 0.010205078125, "learning_rate": 0.0001, "loss": 4.079, "loss/crossentropy": 2.1531224250793457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20502899587154388, "step": 25160 }, { "epoch": 0.50324, "grad_norm": 1.8671875, "grad_norm_var": 0.0105621337890625, "learning_rate": 0.0001, "loss": 3.9895, "loss/crossentropy": 2.024933695793152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21745888888835907, "step": 25162 }, { "epoch": 0.50328, "grad_norm": 1.890625, "grad_norm_var": 0.010465240478515625, "learning_rate": 0.0001, "loss": 3.7801, "loss/crossentropy": 1.7188761234283447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1754877120256424, "step": 25164 }, { "epoch": 0.50332, "grad_norm": 1.96875, "grad_norm_var": 0.009642537434895833, "learning_rate": 0.0001, "loss": 3.9803, "loss/crossentropy": 2.1078373193740845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1812133714556694, "step": 25166 }, { "epoch": 0.50336, "grad_norm": 2.046875, "grad_norm_var": 0.010949452718098959, "learning_rate": 0.0001, "loss": 3.753, "loss/crossentropy": 1.793417751789093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1709349974989891, "step": 25168 }, { "epoch": 0.5034, "grad_norm": 1.734375, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 3.2689, "loss/crossentropy": 1.357922375202179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14589300006628036, "step": 25170 }, { "epoch": 0.50344, "grad_norm": 2.109375, "grad_norm_var": 0.014247385660807292, "learning_rate": 0.0001, "loss": 4.4387, "loss/crossentropy": 2.3469592332839966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20105785876512527, "step": 25172 }, { "epoch": 0.50348, "grad_norm": 1.890625, "grad_norm_var": 0.014159138997395833, "learning_rate": 0.0001, "loss": 3.8041, "loss/crossentropy": 2.1891257762908936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21113110333681107, "step": 25174 }, { "epoch": 0.50352, "grad_norm": 1.890625, "grad_norm_var": 0.012878163655598959, "learning_rate": 0.0001, "loss": 3.9236, "loss/crossentropy": 2.201194167137146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21105756610631943, "step": 25176 }, { "epoch": 0.50356, "grad_norm": 1.859375, "grad_norm_var": 0.012702433268229167, "learning_rate": 0.0001, "loss": 4.0474, "loss/crossentropy": 1.8287039995193481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18074452877044678, "step": 25178 }, { "epoch": 0.5036, "grad_norm": 1.9296875, "grad_norm_var": 0.012813313802083334, "learning_rate": 0.0001, "loss": 4.1655, "loss/crossentropy": 1.8915085792541504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997976452112198, "step": 25180 }, { "epoch": 0.50364, "grad_norm": 1.7109375, "grad_norm_var": 0.014509836832682291, "learning_rate": 0.0001, "loss": 3.8989, "loss/crossentropy": 2.102576494216919, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18678200989961624, "step": 25182 }, { "epoch": 0.50368, "grad_norm": 2.078125, "grad_norm_var": 0.014898427327473958, "learning_rate": 0.0001, "loss": 4.3278, "loss/crossentropy": 2.1603941917419434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20021572709083557, "step": 25184 }, { "epoch": 0.50372, "grad_norm": 1.9609375, "grad_norm_var": 0.011263020833333333, "learning_rate": 0.0001, "loss": 3.6356, "loss/crossentropy": 1.8718876838684082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18388725817203522, "step": 25186 }, { "epoch": 0.50376, "grad_norm": 1.9609375, "grad_norm_var": 0.0069272359212239586, "learning_rate": 0.0001, "loss": 4.1273, "loss/crossentropy": 2.2126933336257935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20829670876264572, "step": 25188 }, { "epoch": 0.5038, "grad_norm": 1.8671875, "grad_norm_var": 0.0072743733723958336, "learning_rate": 0.0001, "loss": 4.0426, "loss/crossentropy": 2.1955376863479614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19484563171863556, "step": 25190 }, { "epoch": 0.50384, "grad_norm": 1.8671875, "grad_norm_var": 0.0067535400390625, "learning_rate": 0.0001, "loss": 4.0132, "loss/crossentropy": 1.9294677376747131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18266455084085464, "step": 25192 }, { "epoch": 0.50388, "grad_norm": 2.265625, "grad_norm_var": 0.012851715087890625, "learning_rate": 0.0001, "loss": 4.1586, "loss/crossentropy": 1.867956280708313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2331952303647995, "step": 25194 }, { "epoch": 0.50392, "grad_norm": 1.90625, "grad_norm_var": 0.012984212239583333, "learning_rate": 0.0001, "loss": 3.8322, "loss/crossentropy": 1.7810456156730652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1834517866373062, "step": 25196 }, { "epoch": 0.50396, "grad_norm": 1.890625, "grad_norm_var": 0.010711415608723959, "learning_rate": 0.0001, "loss": 3.7718, "loss/crossentropy": 2.148942291736603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2065306007862091, "step": 25198 }, { "epoch": 0.504, "grad_norm": 1.84375, "grad_norm_var": 0.011291249593098959, "learning_rate": 0.0001, "loss": 3.9163, "loss/crossentropy": 1.730657696723938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18555008620023727, "step": 25200 }, { "epoch": 0.50404, "grad_norm": 2.078125, "grad_norm_var": 0.014190419514973959, "learning_rate": 0.0001, "loss": 3.9587, "loss/crossentropy": 2.2561585903167725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22614946961402893, "step": 25202 }, { "epoch": 0.50408, "grad_norm": 1.90625, "grad_norm_var": 0.014324696858723958, "learning_rate": 0.0001, "loss": 4.1299, "loss/crossentropy": 2.2798372507095337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19820939004421234, "step": 25204 }, { "epoch": 0.50412, "grad_norm": 2.078125, "grad_norm_var": 0.014582316080729166, "learning_rate": 0.0001, "loss": 4.1232, "loss/crossentropy": 2.2171024680137634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22784201800823212, "step": 25206 }, { "epoch": 0.50416, "grad_norm": 1.8984375, "grad_norm_var": 0.016283162434895835, "learning_rate": 0.0001, "loss": 3.8819, "loss/crossentropy": 1.867311179637909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19023532420396805, "step": 25208 }, { "epoch": 0.5042, "grad_norm": 1.953125, "grad_norm_var": 0.009506988525390624, "learning_rate": 0.0001, "loss": 3.843, "loss/crossentropy": 2.0274672508239746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19995202869176865, "step": 25210 }, { "epoch": 0.50424, "grad_norm": 1.96875, "grad_norm_var": 0.009663899739583334, "learning_rate": 0.0001, "loss": 3.7862, "loss/crossentropy": 1.88859623670578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1842176765203476, "step": 25212 }, { "epoch": 0.50428, "grad_norm": 1.953125, "grad_norm_var": 0.008786773681640625, "learning_rate": 0.0001, "loss": 3.9548, "loss/crossentropy": 1.917616069316864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19126974791288376, "step": 25214 }, { "epoch": 0.50432, "grad_norm": 1.921875, "grad_norm_var": 0.0074615478515625, "learning_rate": 0.0001, "loss": 3.928, "loss/crossentropy": 2.111130714416504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20154303312301636, "step": 25216 }, { "epoch": 0.50436, "grad_norm": 1.96875, "grad_norm_var": 0.004857381184895833, "learning_rate": 0.0001, "loss": 3.9316, "loss/crossentropy": 2.0559436678886414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1823289915919304, "step": 25218 }, { "epoch": 0.5044, "grad_norm": 1.9765625, "grad_norm_var": 0.007225545247395834, "learning_rate": 0.0001, "loss": 4.2238, "loss/crossentropy": 1.8830538392066956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17736117541790009, "step": 25220 }, { "epoch": 0.50444, "grad_norm": 2.140625, "grad_norm_var": 0.009488932291666667, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 1.9821715354919434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20550980418920517, "step": 25222 }, { "epoch": 0.50448, "grad_norm": 2.03125, "grad_norm_var": 0.008670806884765625, "learning_rate": 0.0001, "loss": 4.0223, "loss/crossentropy": 2.182245671749115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2306300848722458, "step": 25224 }, { "epoch": 0.50452, "grad_norm": 1.8515625, "grad_norm_var": 0.0091217041015625, "learning_rate": 0.0001, "loss": 3.9751, "loss/crossentropy": 1.6160125136375427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15688229352235794, "step": 25226 }, { "epoch": 0.50456, "grad_norm": 1.9296875, "grad_norm_var": 0.008988189697265624, "learning_rate": 0.0001, "loss": 4.037, "loss/crossentropy": 1.8484386801719666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19160201400518417, "step": 25228 }, { "epoch": 0.5046, "grad_norm": 2.0, "grad_norm_var": 0.008438873291015624, "learning_rate": 0.0001, "loss": 4.2143, "loss/crossentropy": 2.153999447822571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077987790107727, "step": 25230 }, { "epoch": 0.50464, "grad_norm": 1.8984375, "grad_norm_var": 0.008487701416015625, "learning_rate": 0.0001, "loss": 4.0174, "loss/crossentropy": 2.1414119601249695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2047003135085106, "step": 25232 }, { "epoch": 0.50468, "grad_norm": 1.9296875, "grad_norm_var": 0.00701904296875, "learning_rate": 0.0001, "loss": 4.1841, "loss/crossentropy": 2.128756880760193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19793596118688583, "step": 25234 }, { "epoch": 0.50472, "grad_norm": 1.8125, "grad_norm_var": 0.008160146077473958, "learning_rate": 0.0001, "loss": 4.0356, "loss/crossentropy": 1.9649195075035095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19532814621925354, "step": 25236 }, { "epoch": 0.50476, "grad_norm": 1.921875, "grad_norm_var": 0.006158192952473958, "learning_rate": 0.0001, "loss": 4.3162, "loss/crossentropy": 2.165442705154419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19320187717676163, "step": 25238 }, { "epoch": 0.5048, "grad_norm": 1.828125, "grad_norm_var": 0.005909983317057292, "learning_rate": 0.0001, "loss": 4.0444, "loss/crossentropy": 2.172460913658142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18116996437311172, "step": 25240 }, { "epoch": 0.50484, "grad_norm": 1.8828125, "grad_norm_var": 0.0055501302083333336, "learning_rate": 0.0001, "loss": 3.9613, "loss/crossentropy": 1.9772083163261414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18840450048446655, "step": 25242 }, { "epoch": 0.50488, "grad_norm": 1.9609375, "grad_norm_var": 0.005952962239583333, "learning_rate": 0.0001, "loss": 3.8147, "loss/crossentropy": 1.9985793828964233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19273578375577927, "step": 25244 }, { "epoch": 0.50492, "grad_norm": 1.953125, "grad_norm_var": 0.00513916015625, "learning_rate": 0.0001, "loss": 4.0365, "loss/crossentropy": 2.339000701904297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071395143866539, "step": 25246 }, { "epoch": 0.50496, "grad_norm": 1.90625, "grad_norm_var": 0.006056467692057292, "learning_rate": 0.0001, "loss": 4.2748, "loss/crossentropy": 2.1699774861335754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179349735379219, "step": 25248 }, { "epoch": 0.505, "grad_norm": 1.7890625, "grad_norm_var": 0.0066569010416666664, "learning_rate": 0.0001, "loss": 3.9062, "loss/crossentropy": 1.9445035457611084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17794214189052582, "step": 25250 }, { "epoch": 0.50504, "grad_norm": 2.109375, "grad_norm_var": 0.0077626546223958336, "learning_rate": 0.0001, "loss": 4.3328, "loss/crossentropy": 2.325773239135742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20460912585258484, "step": 25252 }, { "epoch": 0.50508, "grad_norm": 21.75, "grad_norm_var": 24.616644032796223, "learning_rate": 0.0001, "loss": 4.5896, "loss/crossentropy": 2.308157444000244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063561975955963, "step": 25254 }, { "epoch": 0.50512, "grad_norm": 2.1875, "grad_norm_var": 24.524991607666017, "learning_rate": 0.0001, "loss": 4.1727, "loss/crossentropy": 2.1321988105773926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19514495134353638, "step": 25256 }, { "epoch": 0.50516, "grad_norm": 2.03125, "grad_norm_var": 24.486148834228516, "learning_rate": 0.0001, "loss": 3.9135, "loss/crossentropy": 1.7947803735733032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18287408351898193, "step": 25258 }, { "epoch": 0.5052, "grad_norm": 1.8046875, "grad_norm_var": 24.50101318359375, "learning_rate": 0.0001, "loss": 4.0195, "loss/crossentropy": 2.0636664628982544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20408915728330612, "step": 25260 }, { "epoch": 0.50524, "grad_norm": 1.9296875, "grad_norm_var": 24.510210927327474, "learning_rate": 0.0001, "loss": 3.904, "loss/crossentropy": 2.113875925540924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19412121921777725, "step": 25262 }, { "epoch": 0.50528, "grad_norm": 1.8671875, "grad_norm_var": 24.563250478108724, "learning_rate": 0.0001, "loss": 3.9354, "loss/crossentropy": 1.9069438576698303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18107398599386215, "step": 25264 }, { "epoch": 0.50532, "grad_norm": 1.90625, "grad_norm_var": 24.5382687886556, "learning_rate": 0.0001, "loss": 3.7792, "loss/crossentropy": 1.9241713881492615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18617494404315948, "step": 25266 }, { "epoch": 0.50536, "grad_norm": 1.984375, "grad_norm_var": 24.575275675455728, "learning_rate": 0.0001, "loss": 3.8934, "loss/crossentropy": 1.9778280854225159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19652489572763443, "step": 25268 }, { "epoch": 0.5054, "grad_norm": 1.9140625, "grad_norm_var": 0.014729817708333334, "learning_rate": 0.0001, "loss": 3.7332, "loss/crossentropy": 2.0369997024536133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18344805389642715, "step": 25270 }, { "epoch": 0.50544, "grad_norm": 2.015625, "grad_norm_var": 0.007176717122395833, "learning_rate": 0.0001, "loss": 4.0336, "loss/crossentropy": 1.9891296029090881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19600480049848557, "step": 25272 }, { "epoch": 0.50548, "grad_norm": 1.8203125, "grad_norm_var": 0.005619049072265625, "learning_rate": 0.0001, "loss": 4.024, "loss/crossentropy": 2.325801730155945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21493100374937057, "step": 25274 }, { "epoch": 0.50552, "grad_norm": 1.953125, "grad_norm_var": 0.005736287434895833, "learning_rate": 0.0001, "loss": 4.067, "loss/crossentropy": 1.9277620315551758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140570506453514, "step": 25276 }, { "epoch": 0.50556, "grad_norm": 1.875, "grad_norm_var": 0.005763498942057291, "learning_rate": 0.0001, "loss": 3.9703, "loss/crossentropy": 2.3552772998809814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21981871128082275, "step": 25278 }, { "epoch": 0.5056, "grad_norm": 1.8046875, "grad_norm_var": 0.005338287353515625, "learning_rate": 0.0001, "loss": 3.7174, "loss/crossentropy": 1.7559251189231873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1645895466208458, "step": 25280 }, { "epoch": 0.50564, "grad_norm": 1.921875, "grad_norm_var": 0.005149078369140625, "learning_rate": 0.0001, "loss": 4.0222, "loss/crossentropy": 1.8345605731010437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18997152149677277, "step": 25282 }, { "epoch": 0.50568, "grad_norm": 2.03125, "grad_norm_var": 0.005150349934895834, "learning_rate": 0.0001, "loss": 4.0908, "loss/crossentropy": 2.297994017601013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172359898686409, "step": 25284 }, { "epoch": 0.50572, "grad_norm": 1.9375, "grad_norm_var": 0.004239908854166667, "learning_rate": 0.0001, "loss": 3.9462, "loss/crossentropy": 1.9640920758247375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1819777637720108, "step": 25286 }, { "epoch": 0.50576, "grad_norm": 1.7734375, "grad_norm_var": 0.004957834879557292, "learning_rate": 0.0001, "loss": 3.9544, "loss/crossentropy": 2.0045265555381775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20263119041919708, "step": 25288 }, { "epoch": 0.5058, "grad_norm": 1.75, "grad_norm_var": 0.006058756510416667, "learning_rate": 0.0001, "loss": 3.8583, "loss/crossentropy": 2.1333796977996826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20355874300003052, "step": 25290 }, { "epoch": 0.50584, "grad_norm": 2.0, "grad_norm_var": 0.006827799479166666, "learning_rate": 0.0001, "loss": 4.3003, "loss/crossentropy": 2.1814208030700684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19890426099300385, "step": 25292 }, { "epoch": 0.50588, "grad_norm": 2.0, "grad_norm_var": 0.007283528645833333, "learning_rate": 0.0001, "loss": 4.4133, "loss/crossentropy": 2.1642476320266724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955343708395958, "step": 25294 }, { "epoch": 0.50592, "grad_norm": 1.9765625, "grad_norm_var": 0.006498209635416667, "learning_rate": 0.0001, "loss": 4.1446, "loss/crossentropy": 2.126586079597473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19216856360435486, "step": 25296 }, { "epoch": 0.50596, "grad_norm": 1.84375, "grad_norm_var": 0.007094065348307292, "learning_rate": 0.0001, "loss": 4.2278, "loss/crossentropy": 2.4168988466262817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21824205666780472, "step": 25298 }, { "epoch": 0.506, "grad_norm": 1.96875, "grad_norm_var": 0.005887858072916667, "learning_rate": 0.0001, "loss": 4.103, "loss/crossentropy": 2.013881802558899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2011483684182167, "step": 25300 }, { "epoch": 0.50604, "grad_norm": 1.953125, "grad_norm_var": 0.007964833577473959, "learning_rate": 0.0001, "loss": 4.0249, "loss/crossentropy": 2.18127703666687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19816970825195312, "step": 25302 }, { "epoch": 0.50608, "grad_norm": 1.8671875, "grad_norm_var": 0.006249745686848958, "learning_rate": 0.0001, "loss": 4.0909, "loss/crossentropy": 1.8370496034622192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18162214756011963, "step": 25304 }, { "epoch": 0.50612, "grad_norm": 1.9140625, "grad_norm_var": 0.0038998921712239585, "learning_rate": 0.0001, "loss": 4.0421, "loss/crossentropy": 2.12531840801239, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21469102054834366, "step": 25306 }, { "epoch": 0.50616, "grad_norm": 1.9765625, "grad_norm_var": 0.004961903889973958, "learning_rate": 0.0001, "loss": 3.95, "loss/crossentropy": 1.956727385520935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18601331859827042, "step": 25308 }, { "epoch": 0.5062, "grad_norm": 2.359375, "grad_norm_var": 0.016161092122395835, "learning_rate": 0.0001, "loss": 3.9688, "loss/crossentropy": 2.232389807701111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19958854466676712, "step": 25310 }, { "epoch": 0.50624, "grad_norm": 1.921875, "grad_norm_var": 0.016280110677083334, "learning_rate": 0.0001, "loss": 4.0164, "loss/crossentropy": 2.0312620997428894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19724415987730026, "step": 25312 }, { "epoch": 0.50628, "grad_norm": 2.09375, "grad_norm_var": 0.01625544230143229, "learning_rate": 0.0001, "loss": 4.232, "loss/crossentropy": 2.344046115875244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20940567553043365, "step": 25314 }, { "epoch": 0.50632, "grad_norm": 1.84375, "grad_norm_var": 0.017014312744140624, "learning_rate": 0.0001, "loss": 4.046, "loss/crossentropy": 1.99763023853302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1835399493575096, "step": 25316 }, { "epoch": 0.50636, "grad_norm": 1.90625, "grad_norm_var": 0.017044830322265624, "learning_rate": 0.0001, "loss": 3.6584, "loss/crossentropy": 1.9649406671524048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19432833045721054, "step": 25318 }, { "epoch": 0.5064, "grad_norm": 1.9453125, "grad_norm_var": 0.016434478759765624, "learning_rate": 0.0001, "loss": 3.9027, "loss/crossentropy": 1.8670902848243713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1731145828962326, "step": 25320 }, { "epoch": 0.50644, "grad_norm": 1.984375, "grad_norm_var": 0.016706339518229165, "learning_rate": 0.0001, "loss": 4.0066, "loss/crossentropy": 2.0195581912994385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19497420638799667, "step": 25322 }, { "epoch": 0.50648, "grad_norm": 2.078125, "grad_norm_var": 0.016290028889973957, "learning_rate": 0.0001, "loss": 4.1156, "loss/crossentropy": 2.1221953630447388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20753790438175201, "step": 25324 }, { "epoch": 0.50652, "grad_norm": 2.09375, "grad_norm_var": 0.007169596354166667, "learning_rate": 0.0001, "loss": 4.2117, "loss/crossentropy": 2.1407066583633423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934654265642166, "step": 25326 }, { "epoch": 0.50656, "grad_norm": 2.015625, "grad_norm_var": 0.007917277018229167, "learning_rate": 0.0001, "loss": 3.9074, "loss/crossentropy": 1.956154465675354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854771375656128, "step": 25328 }, { "epoch": 0.5066, "grad_norm": 1.921875, "grad_norm_var": 0.007450103759765625, "learning_rate": 0.0001, "loss": 4.0652, "loss/crossentropy": 2.0948599576950073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19326548278331757, "step": 25330 }, { "epoch": 0.50664, "grad_norm": 1.9609375, "grad_norm_var": 0.006705729166666666, "learning_rate": 0.0001, "loss": 4.0609, "loss/crossentropy": 2.137898027896881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20896823704242706, "step": 25332 }, { "epoch": 0.50668, "grad_norm": 1.984375, "grad_norm_var": 0.005859375, "learning_rate": 0.0001, "loss": 3.8687, "loss/crossentropy": 1.6471683382987976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1814308613538742, "step": 25334 }, { "epoch": 0.50672, "grad_norm": 1.8671875, "grad_norm_var": 0.007165273030598958, "learning_rate": 0.0001, "loss": 3.7461, "loss/crossentropy": 1.861536979675293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17766699194908142, "step": 25336 }, { "epoch": 0.50676, "grad_norm": 1.90625, "grad_norm_var": 0.006974029541015625, "learning_rate": 0.0001, "loss": 3.9319, "loss/crossentropy": 2.312160909175873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20586217939853668, "step": 25338 }, { "epoch": 0.5068, "grad_norm": 1.9453125, "grad_norm_var": 0.005293528238932292, "learning_rate": 0.0001, "loss": 4.1102, "loss/crossentropy": 1.8415343165397644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20047077536582947, "step": 25340 }, { "epoch": 0.50684, "grad_norm": 2.046875, "grad_norm_var": 0.0033925374348958335, "learning_rate": 0.0001, "loss": 4.0579, "loss/crossentropy": 1.94148850440979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19869551062583923, "step": 25342 }, { "epoch": 0.50688, "grad_norm": 1.9296875, "grad_norm_var": 0.002559407552083333, "learning_rate": 0.0001, "loss": 4.0379, "loss/crossentropy": 2.280241370201111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19969766587018967, "step": 25344 }, { "epoch": 0.50692, "grad_norm": 1.859375, "grad_norm_var": 0.0026140848795572916, "learning_rate": 0.0001, "loss": 4.1569, "loss/crossentropy": 2.0140087604522705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20252882689237595, "step": 25346 }, { "epoch": 0.50696, "grad_norm": 1.8828125, "grad_norm_var": 0.0073201497395833336, "learning_rate": 0.0001, "loss": 3.9593, "loss/crossentropy": 2.186760902404785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20365024358034134, "step": 25348 }, { "epoch": 0.507, "grad_norm": 1.90625, "grad_norm_var": 0.007161458333333333, "learning_rate": 0.0001, "loss": 4.2392, "loss/crossentropy": 2.324553370475769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20509789884090424, "step": 25350 }, { "epoch": 0.50704, "grad_norm": 2.0, "grad_norm_var": 0.006638336181640625, "learning_rate": 0.0001, "loss": 4.255, "loss/crossentropy": 2.2305015325546265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21707269549369812, "step": 25352 }, { "epoch": 0.50708, "grad_norm": 1.953125, "grad_norm_var": 0.006864166259765625, "learning_rate": 0.0001, "loss": 4.0548, "loss/crossentropy": 2.1571671962738037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20072072744369507, "step": 25354 }, { "epoch": 0.50712, "grad_norm": 1.953125, "grad_norm_var": 0.0072825113932291664, "learning_rate": 0.0001, "loss": 4.0577, "loss/crossentropy": 2.014027178287506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1823829561471939, "step": 25356 }, { "epoch": 0.50716, "grad_norm": 1.9375, "grad_norm_var": 0.008304595947265625, "learning_rate": 0.0001, "loss": 3.7066, "loss/crossentropy": 2.306033492088318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2326786294579506, "step": 25358 }, { "epoch": 0.5072, "grad_norm": 1.9609375, "grad_norm_var": 0.008335113525390625, "learning_rate": 0.0001, "loss": 3.9867, "loss/crossentropy": 2.1029749512672424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19578313827514648, "step": 25360 }, { "epoch": 0.50724, "grad_norm": 1.921875, "grad_norm_var": 0.009242502848307292, "learning_rate": 0.0001, "loss": 3.9696, "loss/crossentropy": 2.244466543197632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20059070736169815, "step": 25362 }, { "epoch": 0.50728, "grad_norm": 2.109375, "grad_norm_var": 0.006858062744140625, "learning_rate": 0.0001, "loss": 4.1295, "loss/crossentropy": 2.3815391063690186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21556073427200317, "step": 25364 }, { "epoch": 0.50732, "grad_norm": 2.0, "grad_norm_var": 0.013767242431640625, "learning_rate": 0.0001, "loss": 4.0822, "loss/crossentropy": 1.997973620891571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20311636477708817, "step": 25366 }, { "epoch": 0.50736, "grad_norm": 1.765625, "grad_norm_var": 0.016845703125, "learning_rate": 0.0001, "loss": 3.8043, "loss/crossentropy": 2.2644081115722656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022029459476471, "step": 25368 }, { "epoch": 0.5074, "grad_norm": 1.921875, "grad_norm_var": 0.01632868448893229, "learning_rate": 0.0001, "loss": 3.9491, "loss/crossentropy": 1.7542709112167358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15834683179855347, "step": 25370 }, { "epoch": 0.50744, "grad_norm": 1.90625, "grad_norm_var": 0.016502888997395833, "learning_rate": 0.0001, "loss": 4.0981, "loss/crossentropy": 2.2310311794281006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21090705692768097, "step": 25372 }, { "epoch": 0.50748, "grad_norm": 1.921875, "grad_norm_var": 0.014977773030598959, "learning_rate": 0.0001, "loss": 3.9221, "loss/crossentropy": 1.9018943905830383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821354478597641, "step": 25374 }, { "epoch": 0.50752, "grad_norm": 1.8125, "grad_norm_var": 0.01602961222330729, "learning_rate": 0.0001, "loss": 4.0671, "loss/crossentropy": 2.0701186656951904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19244033098220825, "step": 25376 }, { "epoch": 0.50756, "grad_norm": 1.890625, "grad_norm_var": 0.01553955078125, "learning_rate": 0.0001, "loss": 3.7639, "loss/crossentropy": 2.0421605110168457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1847202628850937, "step": 25378 }, { "epoch": 0.5076, "grad_norm": 1.953125, "grad_norm_var": 0.014314524332682292, "learning_rate": 0.0001, "loss": 3.9923, "loss/crossentropy": 1.9375264048576355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17147281020879745, "step": 25380 }, { "epoch": 0.50764, "grad_norm": 1.7890625, "grad_norm_var": 0.0062408447265625, "learning_rate": 0.0001, "loss": 4.0086, "loss/crossentropy": 1.9593093395233154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19151991605758667, "step": 25382 }, { "epoch": 0.50768, "grad_norm": 1.7734375, "grad_norm_var": 0.007500966389973958, "learning_rate": 0.0001, "loss": 3.9897, "loss/crossentropy": 1.9283559322357178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18128316849470139, "step": 25384 }, { "epoch": 0.50772, "grad_norm": 2.015625, "grad_norm_var": 0.010970052083333333, "learning_rate": 0.0001, "loss": 4.385, "loss/crossentropy": 1.912258267402649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18579021841287613, "step": 25386 }, { "epoch": 0.50776, "grad_norm": 1.9296875, "grad_norm_var": 0.010619099934895833, "learning_rate": 0.0001, "loss": 4.0037, "loss/crossentropy": 2.161900043487549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025725245475769, "step": 25388 }, { "epoch": 0.5078, "grad_norm": 1.8828125, "grad_norm_var": 0.010733795166015626, "learning_rate": 0.0001, "loss": 4.2077, "loss/crossentropy": 2.1753687858581543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21103990823030472, "step": 25390 }, { "epoch": 0.50784, "grad_norm": 1.9921875, "grad_norm_var": 0.010581207275390626, "learning_rate": 0.0001, "loss": 4.2122, "loss/crossentropy": 1.869983971118927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18909727036952972, "step": 25392 }, { "epoch": 0.50788, "grad_norm": 1.9609375, "grad_norm_var": 0.0122711181640625, "learning_rate": 0.0001, "loss": 4.0602, "loss/crossentropy": 2.0557621121406555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18932487815618515, "step": 25394 }, { "epoch": 0.50792, "grad_norm": 2.015625, "grad_norm_var": 0.011946360270182291, "learning_rate": 0.0001, "loss": 4.0104, "loss/crossentropy": 1.865005910396576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2251267060637474, "step": 25396 }, { "epoch": 0.50796, "grad_norm": 1.8359375, "grad_norm_var": 0.011503092447916667, "learning_rate": 0.0001, "loss": 3.836, "loss/crossentropy": 1.7247771620750427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16510912030935287, "step": 25398 }, { "epoch": 0.508, "grad_norm": 1.8515625, "grad_norm_var": 0.009266916910807292, "learning_rate": 0.0001, "loss": 4.0276, "loss/crossentropy": 2.110623359680176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18747511506080627, "step": 25400 }, { "epoch": 0.50804, "grad_norm": 2.09375, "grad_norm_var": 0.008058420817057292, "learning_rate": 0.0001, "loss": 4.1451, "loss/crossentropy": 2.0501877069473267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19226385653018951, "step": 25402 }, { "epoch": 0.50808, "grad_norm": 1.9453125, "grad_norm_var": 0.008119455973307292, "learning_rate": 0.0001, "loss": 3.8899, "loss/crossentropy": 2.479986786842346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23350444436073303, "step": 25404 }, { "epoch": 0.50812, "grad_norm": 1.9296875, "grad_norm_var": 0.008133951822916667, "learning_rate": 0.0001, "loss": 4.0184, "loss/crossentropy": 2.215437889099121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19325988739728928, "step": 25406 }, { "epoch": 0.50816, "grad_norm": 1.8046875, "grad_norm_var": 0.008485666910807292, "learning_rate": 0.0001, "loss": 3.8584, "loss/crossentropy": 2.1168838143348694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18737324327230453, "step": 25408 }, { "epoch": 0.5082, "grad_norm": 2.0, "grad_norm_var": 0.007173411051432292, "learning_rate": 0.0001, "loss": 3.9339, "loss/crossentropy": 1.9043878316879272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19250840693712234, "step": 25410 }, { "epoch": 0.50824, "grad_norm": 2.15625, "grad_norm_var": 0.010158030192057292, "learning_rate": 0.0001, "loss": 4.2541, "loss/crossentropy": 2.411430239677429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22225366532802582, "step": 25412 }, { "epoch": 0.50828, "grad_norm": 1.9140625, "grad_norm_var": 0.009091949462890625, "learning_rate": 0.0001, "loss": 3.9909, "loss/crossentropy": 1.843587577342987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18295426666736603, "step": 25414 }, { "epoch": 0.50832, "grad_norm": 1.9375, "grad_norm_var": 0.008733876546223958, "learning_rate": 0.0001, "loss": 3.94, "loss/crossentropy": 1.8655884861946106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17755136638879776, "step": 25416 }, { "epoch": 0.50836, "grad_norm": 1.859375, "grad_norm_var": 0.007176717122395833, "learning_rate": 0.0001, "loss": 3.9523, "loss/crossentropy": 1.8799603581428528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18243838846683502, "step": 25418 }, { "epoch": 0.5084, "grad_norm": 1.9921875, "grad_norm_var": 0.007226308186848958, "learning_rate": 0.0001, "loss": 3.9171, "loss/crossentropy": 2.2428990602493286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21759599447250366, "step": 25420 }, { "epoch": 0.50844, "grad_norm": 1.9609375, "grad_norm_var": 0.008265940348307292, "learning_rate": 0.0001, "loss": 3.9152, "loss/crossentropy": 1.9947530031204224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956867277622223, "step": 25422 }, { "epoch": 0.50848, "grad_norm": 1.9140625, "grad_norm_var": 0.006359608968098959, "learning_rate": 0.0001, "loss": 3.9553, "loss/crossentropy": 2.0679984092712402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21864020824432373, "step": 25424 }, { "epoch": 0.50852, "grad_norm": 1.984375, "grad_norm_var": 0.005712890625, "learning_rate": 0.0001, "loss": 4.0127, "loss/crossentropy": 1.9529247879981995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891709715127945, "step": 25426 }, { "epoch": 0.50856, "grad_norm": 1.859375, "grad_norm_var": 0.006308746337890625, "learning_rate": 0.0001, "loss": 4.0049, "loss/crossentropy": 2.1328742504119873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21699222922325134, "step": 25428 }, { "epoch": 0.5086, "grad_norm": 2.015625, "grad_norm_var": 0.00760498046875, "learning_rate": 0.0001, "loss": 4.0057, "loss/crossentropy": 1.8275007009506226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1621478796005249, "step": 25430 }, { "epoch": 0.50864, "grad_norm": 1.7265625, "grad_norm_var": 0.010374959309895833, "learning_rate": 0.0001, "loss": 3.7843, "loss/crossentropy": 2.134227454662323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19917849451303482, "step": 25432 }, { "epoch": 0.50868, "grad_norm": 2.015625, "grad_norm_var": 0.010499827067057292, "learning_rate": 0.0001, "loss": 4.099, "loss/crossentropy": 2.040915012359619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1758517250418663, "step": 25434 }, { "epoch": 0.50872, "grad_norm": 2.09375, "grad_norm_var": 0.012247721354166666, "learning_rate": 0.0001, "loss": 4.0379, "loss/crossentropy": 1.9219905734062195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26709431409835815, "step": 25436 }, { "epoch": 0.50876, "grad_norm": 1.9375, "grad_norm_var": 0.011683909098307292, "learning_rate": 0.0001, "loss": 3.8866, "loss/crossentropy": 1.8520461320877075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17154939472675323, "step": 25438 }, { "epoch": 0.5088, "grad_norm": 1.9921875, "grad_norm_var": 0.011891428629557292, "learning_rate": 0.0001, "loss": 3.9908, "loss/crossentropy": 2.091896414756775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19294072687625885, "step": 25440 }, { "epoch": 0.50884, "grad_norm": 1.890625, "grad_norm_var": 0.012029774983723958, "learning_rate": 0.0001, "loss": 3.8595, "loss/crossentropy": 2.209986925125122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19535083323717117, "step": 25442 }, { "epoch": 0.50888, "grad_norm": 1.8828125, "grad_norm_var": 0.010643513997395833, "learning_rate": 0.0001, "loss": 3.5984, "loss/crossentropy": 1.7103394865989685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17907221615314484, "step": 25444 }, { "epoch": 0.50892, "grad_norm": 1.8828125, "grad_norm_var": 0.0099365234375, "learning_rate": 0.0001, "loss": 4.0083, "loss/crossentropy": 2.185406744480133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19615823030471802, "step": 25446 }, { "epoch": 0.50896, "grad_norm": 1.9296875, "grad_norm_var": 0.009317779541015625, "learning_rate": 0.0001, "loss": 4.0897, "loss/crossentropy": 1.7030528783798218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19345925003290176, "step": 25448 }, { "epoch": 0.509, "grad_norm": 1.8828125, "grad_norm_var": 0.0098541259765625, "learning_rate": 0.0001, "loss": 3.8739, "loss/crossentropy": 1.9207965731620789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18073637783527374, "step": 25450 }, { "epoch": 0.50904, "grad_norm": 1.8984375, "grad_norm_var": 0.008307902018229167, "learning_rate": 0.0001, "loss": 3.8217, "loss/crossentropy": 1.8818482160568237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18431610614061356, "step": 25452 }, { "epoch": 0.50908, "grad_norm": 1.7578125, "grad_norm_var": 0.009965006510416667, "learning_rate": 0.0001, "loss": 3.8117, "loss/crossentropy": 2.2623773217201233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20106880366802216, "step": 25454 }, { "epoch": 0.50912, "grad_norm": 1.921875, "grad_norm_var": 0.010782623291015625, "learning_rate": 0.0001, "loss": 4.1155, "loss/crossentropy": 2.303765892982483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19978389143943787, "step": 25456 }, { "epoch": 0.50916, "grad_norm": 2.09375, "grad_norm_var": 0.012824503580729167, "learning_rate": 0.0001, "loss": 4.0437, "loss/crossentropy": 2.1355656385421753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20279624313116074, "step": 25458 }, { "epoch": 0.5092, "grad_norm": 2.078125, "grad_norm_var": 0.012621815999348958, "learning_rate": 0.0001, "loss": 3.9021, "loss/crossentropy": 1.7096583843231201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16103574633598328, "step": 25460 }, { "epoch": 0.50924, "grad_norm": 1.90625, "grad_norm_var": 0.012889607747395834, "learning_rate": 0.0001, "loss": 3.9125, "loss/crossentropy": 1.7230717539787292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16039150953292847, "step": 25462 }, { "epoch": 0.50928, "grad_norm": 1.9375, "grad_norm_var": 0.024662017822265625, "learning_rate": 0.0001, "loss": 4.0199, "loss/crossentropy": 1.983295977115631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20290112495422363, "step": 25464 }, { "epoch": 0.50932, "grad_norm": 1.9140625, "grad_norm_var": 0.024812825520833335, "learning_rate": 0.0001, "loss": 3.8669, "loss/crossentropy": 1.7826906442642212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1826622262597084, "step": 25466 }, { "epoch": 0.50936, "grad_norm": 1.8125, "grad_norm_var": 0.024472808837890624, "learning_rate": 0.0001, "loss": 3.6992, "loss/crossentropy": 1.7718060612678528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1686256304383278, "step": 25468 }, { "epoch": 0.5094, "grad_norm": 1.8515625, "grad_norm_var": 0.021605428059895834, "learning_rate": 0.0001, "loss": 3.7736, "loss/crossentropy": 1.9546560645103455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1890355497598648, "step": 25470 }, { "epoch": 0.50944, "grad_norm": 1.7578125, "grad_norm_var": 0.024265289306640625, "learning_rate": 0.0001, "loss": 3.7519, "loss/crossentropy": 2.2464581727981567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21081877499818802, "step": 25472 }, { "epoch": 0.50948, "grad_norm": 1.9453125, "grad_norm_var": 0.022810872395833334, "learning_rate": 0.0001, "loss": 4.0546, "loss/crossentropy": 2.150168299674988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20787308365106583, "step": 25474 }, { "epoch": 0.50952, "grad_norm": 1.828125, "grad_norm_var": 0.020169830322265624, "learning_rate": 0.0001, "loss": 3.8762, "loss/crossentropy": 2.08556866645813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18290850520133972, "step": 25476 }, { "epoch": 0.50956, "grad_norm": 1.9921875, "grad_norm_var": 0.0203125, "learning_rate": 0.0001, "loss": 4.1563, "loss/crossentropy": 2.174665331840515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2028878778219223, "step": 25478 }, { "epoch": 0.5096, "grad_norm": 1.8515625, "grad_norm_var": 0.0035764058430989582, "learning_rate": 0.0001, "loss": 3.9699, "loss/crossentropy": 1.9720661640167236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19616466760635376, "step": 25480 }, { "epoch": 0.50964, "grad_norm": 2.015625, "grad_norm_var": 0.0045562744140625, "learning_rate": 0.0001, "loss": 3.9704, "loss/crossentropy": 2.2741299867630005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2136249840259552, "step": 25482 }, { "epoch": 0.50968, "grad_norm": 1.875, "grad_norm_var": 0.004620107014973959, "learning_rate": 0.0001, "loss": 3.9, "loss/crossentropy": 1.8212140798568726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16650617122650146, "step": 25484 }, { "epoch": 0.50972, "grad_norm": 2.078125, "grad_norm_var": 0.0072174072265625, "learning_rate": 0.0001, "loss": 3.9479, "loss/crossentropy": 2.25445818901062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21185942739248276, "step": 25486 }, { "epoch": 0.50976, "grad_norm": 1.890625, "grad_norm_var": 0.0061838785807291664, "learning_rate": 0.0001, "loss": 4.273, "loss/crossentropy": 2.198422074317932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19693002849817276, "step": 25488 }, { "epoch": 0.5098, "grad_norm": 2.03125, "grad_norm_var": 0.007177480061848958, "learning_rate": 0.0001, "loss": 4.0788, "loss/crossentropy": 2.0654536485671997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20444564521312714, "step": 25490 }, { "epoch": 0.50984, "grad_norm": 1.84375, "grad_norm_var": 0.007889811197916667, "learning_rate": 0.0001, "loss": 4.1771, "loss/crossentropy": 1.9605732560157776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1785297840833664, "step": 25492 }, { "epoch": 0.50988, "grad_norm": 2.09375, "grad_norm_var": 0.009968821207682292, "learning_rate": 0.0001, "loss": 4.2828, "loss/crossentropy": 2.128088355064392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063232958316803, "step": 25494 }, { "epoch": 0.50992, "grad_norm": 1.921875, "grad_norm_var": 0.011039225260416667, "learning_rate": 0.0001, "loss": 3.7328, "loss/crossentropy": 1.9981979727745056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17392293363809586, "step": 25496 }, { "epoch": 0.50996, "grad_norm": 2.140625, "grad_norm_var": 0.014243316650390626, "learning_rate": 0.0001, "loss": 4.3475, "loss/crossentropy": 2.4065998792648315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20214569568634033, "step": 25498 }, { "epoch": 0.51, "grad_norm": 1.8515625, "grad_norm_var": 0.01361083984375, "learning_rate": 0.0001, "loss": 3.9476, "loss/crossentropy": 2.154718041419983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19910430163145065, "step": 25500 }, { "epoch": 0.51004, "grad_norm": 2.03125, "grad_norm_var": 0.011042277018229166, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 2.22087424993515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20276758074760437, "step": 25502 }, { "epoch": 0.51008, "grad_norm": 1.8671875, "grad_norm_var": 0.014336903889973959, "learning_rate": 0.0001, "loss": 3.6317, "loss/crossentropy": 1.9433262944221497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1835595816373825, "step": 25504 }, { "epoch": 0.51012, "grad_norm": 1.96875, "grad_norm_var": 0.014625803629557291, "learning_rate": 0.0001, "loss": 3.8672, "loss/crossentropy": 1.814663290977478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932072937488556, "step": 25506 }, { "epoch": 0.51016, "grad_norm": 2.078125, "grad_norm_var": 0.0143463134765625, "learning_rate": 0.0001, "loss": 3.9603, "loss/crossentropy": 1.8567258715629578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20248033851385117, "step": 25508 }, { "epoch": 0.5102, "grad_norm": 1.9921875, "grad_norm_var": 0.013232167561848958, "learning_rate": 0.0001, "loss": 4.1419, "loss/crossentropy": 2.056326985359192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19645985960960388, "step": 25510 }, { "epoch": 0.51024, "grad_norm": 1.9375, "grad_norm_var": 0.010884348551432292, "learning_rate": 0.0001, "loss": 4.2054, "loss/crossentropy": 2.068827450275421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19381123036146164, "step": 25512 }, { "epoch": 0.51028, "grad_norm": 1.90625, "grad_norm_var": 0.008898671468098958, "learning_rate": 0.0001, "loss": 3.9707, "loss/crossentropy": 2.4085018634796143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20001942664384842, "step": 25514 }, { "epoch": 0.51032, "grad_norm": 1.921875, "grad_norm_var": 0.008056386311848959, "learning_rate": 0.0001, "loss": 4.0348, "loss/crossentropy": 1.9481959342956543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17734962701797485, "step": 25516 }, { "epoch": 0.51036, "grad_norm": 1.7265625, "grad_norm_var": 0.011722819010416666, "learning_rate": 0.0001, "loss": 3.6203, "loss/crossentropy": 1.9250916838645935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881168857216835, "step": 25518 }, { "epoch": 0.5104, "grad_norm": 1.8984375, "grad_norm_var": 0.0101715087890625, "learning_rate": 0.0001, "loss": 4.2853, "loss/crossentropy": 2.4334945678710938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22089337557554245, "step": 25520 }, { "epoch": 0.51044, "grad_norm": 1.9296875, "grad_norm_var": 0.012629954020182292, "learning_rate": 0.0001, "loss": 3.9618, "loss/crossentropy": 1.8872219324111938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17582690715789795, "step": 25522 }, { "epoch": 0.51048, "grad_norm": 1.9765625, "grad_norm_var": 0.011156209309895833, "learning_rate": 0.0001, "loss": 4.0339, "loss/crossentropy": 2.0595308542251587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20405854284763336, "step": 25524 }, { "epoch": 0.51052, "grad_norm": 2.015625, "grad_norm_var": 0.012819163004557292, "learning_rate": 0.0001, "loss": 4.0668, "loss/crossentropy": 1.7874937653541565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21105076372623444, "step": 25526 }, { "epoch": 0.51056, "grad_norm": 2.03125, "grad_norm_var": 0.014076487223307291, "learning_rate": 0.0001, "loss": 3.8664, "loss/crossentropy": 1.866211712360382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18459182977676392, "step": 25528 }, { "epoch": 0.5106, "grad_norm": 2.015625, "grad_norm_var": 0.015641021728515624, "learning_rate": 0.0001, "loss": 4.0479, "loss/crossentropy": 1.8424429893493652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17495467513799667, "step": 25530 }, { "epoch": 0.51064, "grad_norm": 2.046875, "grad_norm_var": 0.015632120768229167, "learning_rate": 0.0001, "loss": 4.1848, "loss/crossentropy": 2.3827977180480957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2094314694404602, "step": 25532 }, { "epoch": 0.51068, "grad_norm": 1.8359375, "grad_norm_var": 0.00950927734375, "learning_rate": 0.0001, "loss": 3.9321, "loss/crossentropy": 2.0699292421340942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20574011653661728, "step": 25534 }, { "epoch": 0.51072, "grad_norm": 2.015625, "grad_norm_var": 0.0102447509765625, "learning_rate": 0.0001, "loss": 3.7784, "loss/crossentropy": 2.058907687664032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1945437341928482, "step": 25536 }, { "epoch": 0.51076, "grad_norm": 1.90625, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 4.0229, "loss/crossentropy": 2.138782501220703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19527214765548706, "step": 25538 }, { "epoch": 0.5108, "grad_norm": 1.890625, "grad_norm_var": 0.00936279296875, "learning_rate": 0.0001, "loss": 3.9339, "loss/crossentropy": 1.9123243689537048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18704041838645935, "step": 25540 }, { "epoch": 0.51084, "grad_norm": 1.9296875, "grad_norm_var": 0.008056640625, "learning_rate": 0.0001, "loss": 4.2846, "loss/crossentropy": 2.5264101028442383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23050238192081451, "step": 25542 }, { "epoch": 0.51088, "grad_norm": 1.875, "grad_norm_var": 0.005826822916666667, "learning_rate": 0.0001, "loss": 4.0812, "loss/crossentropy": 1.9210030436515808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19104420393705368, "step": 25544 }, { "epoch": 0.51092, "grad_norm": 1.7734375, "grad_norm_var": 0.006809234619140625, "learning_rate": 0.0001, "loss": 3.6386, "loss/crossentropy": 1.7989648580551147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17732901126146317, "step": 25546 }, { "epoch": 0.51096, "grad_norm": 1.9375, "grad_norm_var": 0.006086985270182292, "learning_rate": 0.0001, "loss": 4.0115, "loss/crossentropy": 1.7673185467720032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19276081770658493, "step": 25548 }, { "epoch": 0.511, "grad_norm": 1.8359375, "grad_norm_var": 0.005586496988932292, "learning_rate": 0.0001, "loss": 3.9068, "loss/crossentropy": 2.451088309288025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21039804071187973, "step": 25550 }, { "epoch": 0.51104, "grad_norm": 1.8671875, "grad_norm_var": 0.004813385009765625, "learning_rate": 0.0001, "loss": 3.9767, "loss/crossentropy": 2.0937219858169556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21383897960186005, "step": 25552 }, { "epoch": 0.51108, "grad_norm": 1.7890625, "grad_norm_var": 0.007010650634765625, "learning_rate": 0.0001, "loss": 3.8091, "loss/crossentropy": 1.7688068747520447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17263571172952652, "step": 25554 }, { "epoch": 0.51112, "grad_norm": 2.046875, "grad_norm_var": 0.008137003580729166, "learning_rate": 0.0001, "loss": 4.1427, "loss/crossentropy": 2.165677785873413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19934290647506714, "step": 25556 }, { "epoch": 0.51116, "grad_norm": 1.8203125, "grad_norm_var": 0.009197743733723958, "learning_rate": 0.0001, "loss": 3.8542, "loss/crossentropy": 1.7860172986984253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18735045939683914, "step": 25558 }, { "epoch": 0.5112, "grad_norm": 2.15625, "grad_norm_var": 0.012165323893229166, "learning_rate": 0.0001, "loss": 4.0364, "loss/crossentropy": 2.2124353647232056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27785858511924744, "step": 25560 }, { "epoch": 0.51124, "grad_norm": 2.265625, "grad_norm_var": 0.01620457967122396, "learning_rate": 0.0001, "loss": 4.0802, "loss/crossentropy": 1.8342975974082947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1646983027458191, "step": 25562 }, { "epoch": 0.51128, "grad_norm": 1.8828125, "grad_norm_var": 0.01895319620768229, "learning_rate": 0.0001, "loss": 3.7718, "loss/crossentropy": 2.0271248817443848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18964418768882751, "step": 25564 }, { "epoch": 0.51132, "grad_norm": 1.8828125, "grad_norm_var": 0.01892267862955729, "learning_rate": 0.0001, "loss": 3.8739, "loss/crossentropy": 2.1983633041381836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039782777428627, "step": 25566 }, { "epoch": 0.51136, "grad_norm": 1.8125, "grad_norm_var": 0.01982421875, "learning_rate": 0.0001, "loss": 3.9194, "loss/crossentropy": 1.804263949394226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17312633991241455, "step": 25568 }, { "epoch": 0.5114, "grad_norm": 1.8828125, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 4.2465, "loss/crossentropy": 1.7486448287963867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16695599257946014, "step": 25570 }, { "epoch": 0.51144, "grad_norm": 1.765625, "grad_norm_var": 0.02077204386393229, "learning_rate": 0.0001, "loss": 3.7398, "loss/crossentropy": 1.9466286301612854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805291324853897, "step": 25572 }, { "epoch": 0.51148, "grad_norm": 1.9921875, "grad_norm_var": 0.020703125, "learning_rate": 0.0001, "loss": 4.1362, "loss/crossentropy": 2.203175187110901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21553437411785126, "step": 25574 }, { "epoch": 0.51152, "grad_norm": 1.9453125, "grad_norm_var": 0.01708958943684896, "learning_rate": 0.0001, "loss": 4.1116, "loss/crossentropy": 1.8743736743927002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19245517253875732, "step": 25576 }, { "epoch": 0.51156, "grad_norm": 1.90625, "grad_norm_var": 0.008211008707682292, "learning_rate": 0.0001, "loss": 3.8857, "loss/crossentropy": 1.866178572177887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18233061581850052, "step": 25578 }, { "epoch": 0.5116, "grad_norm": 1.984375, "grad_norm_var": 0.007420857747395833, "learning_rate": 0.0001, "loss": 3.8674, "loss/crossentropy": 2.0799529552459717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19485916197299957, "step": 25580 }, { "epoch": 0.51164, "grad_norm": 1.8828125, "grad_norm_var": 0.007222493489583333, "learning_rate": 0.0001, "loss": 3.9318, "loss/crossentropy": 1.7319077253341675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16157583892345428, "step": 25582 }, { "epoch": 0.51168, "grad_norm": 1.84375, "grad_norm_var": 0.0073893229166666664, "learning_rate": 0.0001, "loss": 3.7334, "loss/crossentropy": 2.0670501589775085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18853918462991714, "step": 25584 }, { "epoch": 0.51172, "grad_norm": 1.9140625, "grad_norm_var": 0.004671223958333333, "learning_rate": 0.0001, "loss": 4.0737, "loss/crossentropy": 2.0605591535568237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799873411655426, "step": 25586 }, { "epoch": 0.51176, "grad_norm": 1.796875, "grad_norm_var": 0.004288482666015625, "learning_rate": 0.0001, "loss": 3.9416, "loss/crossentropy": 1.9844827055931091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19314195960760117, "step": 25588 }, { "epoch": 0.5118, "grad_norm": 1.84375, "grad_norm_var": 0.0035906473795572916, "learning_rate": 0.0001, "loss": 3.9044, "loss/crossentropy": 2.292548894882202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19697485864162445, "step": 25590 }, { "epoch": 0.51184, "grad_norm": 1.953125, "grad_norm_var": 0.004400380452473958, "learning_rate": 0.0001, "loss": 4.1964, "loss/crossentropy": 2.472001791000366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20994122326374054, "step": 25592 }, { "epoch": 0.51188, "grad_norm": 2.25, "grad_norm_var": 0.011283365885416667, "learning_rate": 0.0001, "loss": 3.9873, "loss/crossentropy": 1.3704800009727478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1426696479320526, "step": 25594 }, { "epoch": 0.51192, "grad_norm": 1.875, "grad_norm_var": 0.0113922119140625, "learning_rate": 0.0001, "loss": 3.9129, "loss/crossentropy": 1.6338598132133484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17765308916568756, "step": 25596 }, { "epoch": 0.51196, "grad_norm": 1.9453125, "grad_norm_var": 0.011717732747395833, "learning_rate": 0.0001, "loss": 4.1234, "loss/crossentropy": 1.8394725322723389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996341496706009, "step": 25598 }, { "epoch": 0.512, "grad_norm": 1.7578125, "grad_norm_var": 0.012443033854166667, "learning_rate": 0.0001, "loss": 3.9978, "loss/crossentropy": 1.9061697125434875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18420706689357758, "step": 25600 }, { "epoch": 0.51204, "grad_norm": 1.8984375, "grad_norm_var": 0.012426503499348958, "learning_rate": 0.0001, "loss": 4.0949, "loss/crossentropy": 2.3584840297698975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21293433010578156, "step": 25602 }, { "epoch": 0.51208, "grad_norm": 1.8984375, "grad_norm_var": 0.012251536051432291, "learning_rate": 0.0001, "loss": 3.8624, "loss/crossentropy": 1.923392653465271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18330930918455124, "step": 25604 }, { "epoch": 0.51212, "grad_norm": 1.890625, "grad_norm_var": 0.011934153238932292, "learning_rate": 0.0001, "loss": 4.027, "loss/crossentropy": 2.2249737977981567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061079815030098, "step": 25606 }, { "epoch": 0.51216, "grad_norm": 1.7578125, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 3.8274, "loss/crossentropy": 1.7640107870101929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1722632348537445, "step": 25608 }, { "epoch": 0.5122, "grad_norm": 2.0, "grad_norm_var": 0.006278483072916666, "learning_rate": 0.0001, "loss": 4.0634, "loss/crossentropy": 2.0654106736183167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959560066461563, "step": 25610 }, { "epoch": 0.51224, "grad_norm": 1.9140625, "grad_norm_var": 0.006685384114583333, "learning_rate": 0.0001, "loss": 3.857, "loss/crossentropy": 1.8277021646499634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1689283475279808, "step": 25612 }, { "epoch": 0.51228, "grad_norm": 1.953125, "grad_norm_var": 0.0066802978515625, "learning_rate": 0.0001, "loss": 3.9565, "loss/crossentropy": 2.2842466831207275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116750180721283, "step": 25614 }, { "epoch": 0.51232, "grad_norm": 2.34375, "grad_norm_var": 0.01710205078125, "learning_rate": 0.0001, "loss": 3.7537, "loss/crossentropy": 1.8136274218559265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18518541753292084, "step": 25616 }, { "epoch": 0.51236, "grad_norm": 1.90625, "grad_norm_var": 0.017277018229166666, "learning_rate": 0.0001, "loss": 3.9369, "loss/crossentropy": 2.0048798322677612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975068524479866, "step": 25618 }, { "epoch": 0.5124, "grad_norm": 1.9375, "grad_norm_var": 0.016745758056640626, "learning_rate": 0.0001, "loss": 3.7449, "loss/crossentropy": 1.7246285676956177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17372817546129227, "step": 25620 }, { "epoch": 0.51244, "grad_norm": 1.9453125, "grad_norm_var": 0.0177978515625, "learning_rate": 0.0001, "loss": 4.2149, "loss/crossentropy": 2.160382390022278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20257163047790527, "step": 25622 }, { "epoch": 0.51248, "grad_norm": 1.9921875, "grad_norm_var": 0.015604400634765625, "learning_rate": 0.0001, "loss": 4.0976, "loss/crossentropy": 2.1341055631637573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18860094994306564, "step": 25624 }, { "epoch": 0.51252, "grad_norm": 1.8203125, "grad_norm_var": 0.01613337198893229, "learning_rate": 0.0001, "loss": 3.9408, "loss/crossentropy": 2.1212183237075806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18820258975028992, "step": 25626 }, { "epoch": 0.51256, "grad_norm": 1.84375, "grad_norm_var": 0.017045084635416666, "learning_rate": 0.0001, "loss": 3.88, "loss/crossentropy": 2.0193417072296143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18116050213575363, "step": 25628 }, { "epoch": 0.5126, "grad_norm": 1.8359375, "grad_norm_var": 0.017472330729166666, "learning_rate": 0.0001, "loss": 4.0049, "loss/crossentropy": 2.2681472301483154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21637655794620514, "step": 25630 }, { "epoch": 0.51264, "grad_norm": 1.8515625, "grad_norm_var": 0.006444295247395833, "learning_rate": 0.0001, "loss": 4.1065, "loss/crossentropy": 1.879043698310852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1792997345328331, "step": 25632 }, { "epoch": 0.51268, "grad_norm": 1.8671875, "grad_norm_var": 0.005980428059895833, "learning_rate": 0.0001, "loss": 3.9104, "loss/crossentropy": 1.8325838446617126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18299763649702072, "step": 25634 }, { "epoch": 0.51272, "grad_norm": 1.859375, "grad_norm_var": 0.006241607666015625, "learning_rate": 0.0001, "loss": 3.9663, "loss/crossentropy": 2.1776668429374695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21143290400505066, "step": 25636 }, { "epoch": 0.51276, "grad_norm": 1.8515625, "grad_norm_var": 0.003932444254557291, "learning_rate": 0.0001, "loss": 3.7604, "loss/crossentropy": 2.010956645011902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18642369657754898, "step": 25638 }, { "epoch": 0.5128, "grad_norm": 1.796875, "grad_norm_var": 0.0038401285807291665, "learning_rate": 0.0001, "loss": 3.9791, "loss/crossentropy": 2.3246684074401855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20379284024238586, "step": 25640 }, { "epoch": 0.51284, "grad_norm": 2.140625, "grad_norm_var": 0.008740234375, "learning_rate": 0.0001, "loss": 3.9221, "loss/crossentropy": 2.1043150424957275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1939583420753479, "step": 25642 }, { "epoch": 0.51288, "grad_norm": 1.8125, "grad_norm_var": 0.008308664957682291, "learning_rate": 0.0001, "loss": 3.8916, "loss/crossentropy": 1.947311520576477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19082298129796982, "step": 25644 }, { "epoch": 0.51292, "grad_norm": 1.84375, "grad_norm_var": 0.008581288655598958, "learning_rate": 0.0001, "loss": 3.991, "loss/crossentropy": 2.189740777015686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19475838541984558, "step": 25646 }, { "epoch": 0.51296, "grad_norm": 1.875, "grad_norm_var": 0.007420857747395833, "learning_rate": 0.0001, "loss": 4.0033, "loss/crossentropy": 1.6541009545326233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16929760575294495, "step": 25648 }, { "epoch": 0.513, "grad_norm": 2.125, "grad_norm_var": 0.011130523681640626, "learning_rate": 0.0001, "loss": 4.1202, "loss/crossentropy": 2.102361261844635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20186372101306915, "step": 25650 }, { "epoch": 0.51304, "grad_norm": 2.109375, "grad_norm_var": 0.016169230143229168, "learning_rate": 0.0001, "loss": 4.3193, "loss/crossentropy": 2.1970421075820923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268364280462265, "step": 25652 }, { "epoch": 0.51308, "grad_norm": 1.921875, "grad_norm_var": 0.01594823201497396, "learning_rate": 0.0001, "loss": 4.0125, "loss/crossentropy": 1.9391701817512512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19213733822107315, "step": 25654 }, { "epoch": 0.51312, "grad_norm": 1.828125, "grad_norm_var": 0.015453084309895834, "learning_rate": 0.0001, "loss": 4.1499, "loss/crossentropy": 1.9313429594039917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.184333935379982, "step": 25656 }, { "epoch": 0.51316, "grad_norm": 2.046875, "grad_norm_var": 0.03003107706705729, "learning_rate": 0.0001, "loss": 4.0926, "loss/crossentropy": 2.221387028694153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127813696861267, "step": 25658 }, { "epoch": 0.5132, "grad_norm": 2.21875, "grad_norm_var": 0.0320220947265625, "learning_rate": 0.0001, "loss": 4.2681, "loss/crossentropy": 2.0020928978919983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1746811792254448, "step": 25660 }, { "epoch": 0.51324, "grad_norm": 3.421875, "grad_norm_var": 0.15138320922851561, "learning_rate": 0.0001, "loss": 3.8316, "loss/crossentropy": 1.960661768913269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19446442276239395, "step": 25662 }, { "epoch": 0.51328, "grad_norm": 2.125, "grad_norm_var": 0.1433428446451823, "learning_rate": 0.0001, "loss": 3.7593, "loss/crossentropy": 1.6914892792701721, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17224331945180893, "step": 25664 }, { "epoch": 0.51332, "grad_norm": 1.78125, "grad_norm_var": 0.14908447265625, "learning_rate": 0.0001, "loss": 3.7995, "loss/crossentropy": 1.8427329063415527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18707136064767838, "step": 25666 }, { "epoch": 0.51336, "grad_norm": 2.453125, "grad_norm_var": 0.1630938212076823, "learning_rate": 0.0001, "loss": 3.7455, "loss/crossentropy": 1.7277203798294067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1778879463672638, "step": 25668 }, { "epoch": 0.5134, "grad_norm": 1.9765625, "grad_norm_var": 0.16569417317708332, "learning_rate": 0.0001, "loss": 3.817, "loss/crossentropy": 1.9606621861457825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19271928071975708, "step": 25670 }, { "epoch": 0.51344, "grad_norm": 2.171875, "grad_norm_var": 0.1670000712076823, "learning_rate": 0.0001, "loss": 3.7844, "loss/crossentropy": 2.004323959350586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18009979277849197, "step": 25672 }, { "epoch": 0.51348, "grad_norm": 2.59375, "grad_norm_var": 0.1731035868326823, "learning_rate": 0.0001, "loss": 4.1429, "loss/crossentropy": 2.0127062797546387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20746992528438568, "step": 25674 }, { "epoch": 0.51352, "grad_norm": 1.828125, "grad_norm_var": 0.181298828125, "learning_rate": 0.0001, "loss": 3.9813, "loss/crossentropy": 2.240972399711609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19375959038734436, "step": 25676 }, { "epoch": 0.51356, "grad_norm": 1.9921875, "grad_norm_var": 0.06014404296875, "learning_rate": 0.0001, "loss": 4.1652, "loss/crossentropy": 2.1942732334136963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21413519978523254, "step": 25678 }, { "epoch": 0.5136, "grad_norm": 1.8125, "grad_norm_var": 0.061522420247395834, "learning_rate": 0.0001, "loss": 4.0474, "loss/crossentropy": 2.0388938188552856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887573003768921, "step": 25680 }, { "epoch": 0.51364, "grad_norm": 1.8515625, "grad_norm_var": 0.059458160400390626, "learning_rate": 0.0001, "loss": 4.0529, "loss/crossentropy": 2.147862672805786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947481632232666, "step": 25682 }, { "epoch": 0.51368, "grad_norm": 1.9140625, "grad_norm_var": 0.052308909098307294, "learning_rate": 0.0001, "loss": 4.0428, "loss/crossentropy": 1.7891934514045715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200366035103798, "step": 25684 }, { "epoch": 0.51372, "grad_norm": 1.9921875, "grad_norm_var": 0.047973378499348955, "learning_rate": 0.0001, "loss": 4.1498, "loss/crossentropy": 2.0402196645736694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20540674775838852, "step": 25686 }, { "epoch": 0.51376, "grad_norm": 1.8046875, "grad_norm_var": 0.044535064697265626, "learning_rate": 0.0001, "loss": 3.809, "loss/crossentropy": 1.9355202913284302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19018185883760452, "step": 25688 }, { "epoch": 0.5138, "grad_norm": 1.765625, "grad_norm_var": 0.021648915608723958, "learning_rate": 0.0001, "loss": 3.8918, "loss/crossentropy": 2.140886068344116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20067927986383438, "step": 25690 }, { "epoch": 0.51384, "grad_norm": 2.03125, "grad_norm_var": 0.021149698893229166, "learning_rate": 0.0001, "loss": 3.7235, "loss/crossentropy": 1.9624481201171875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.191593699157238, "step": 25692 }, { "epoch": 0.51388, "grad_norm": 1.9921875, "grad_norm_var": 0.021834055582682293, "learning_rate": 0.0001, "loss": 4.1363, "loss/crossentropy": 2.2475873231887817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987176612019539, "step": 25694 }, { "epoch": 0.51392, "grad_norm": 2.078125, "grad_norm_var": 0.020857747395833334, "learning_rate": 0.0001, "loss": 4.0871, "loss/crossentropy": 2.336251735687256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216811552643776, "step": 25696 }, { "epoch": 0.51396, "grad_norm": 2.015625, "grad_norm_var": 0.019893391927083334, "learning_rate": 0.0001, "loss": 3.9574, "loss/crossentropy": 1.9058191776275635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20057585835456848, "step": 25698 }, { "epoch": 0.514, "grad_norm": 1.828125, "grad_norm_var": 0.009749094645182291, "learning_rate": 0.0001, "loss": 3.8912, "loss/crossentropy": 1.8164226412773132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772766411304474, "step": 25700 }, { "epoch": 0.51404, "grad_norm": 2.078125, "grad_norm_var": 0.010578409830729166, "learning_rate": 0.0001, "loss": 4.1467, "loss/crossentropy": 1.9770846962928772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19005891680717468, "step": 25702 }, { "epoch": 0.51408, "grad_norm": 3.5625, "grad_norm_var": 0.1743242899576823, "learning_rate": 0.0001, "loss": 3.859, "loss/crossentropy": 1.9907479286193848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19942636042833328, "step": 25704 }, { "epoch": 0.51412, "grad_norm": 1.8125, "grad_norm_var": 0.17293472290039064, "learning_rate": 0.0001, "loss": 3.7331, "loss/crossentropy": 1.803969383239746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17413488030433655, "step": 25706 }, { "epoch": 0.51416, "grad_norm": 1.84375, "grad_norm_var": 0.1740386962890625, "learning_rate": 0.0001, "loss": 4.0005, "loss/crossentropy": 1.9947617053985596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18278180807828903, "step": 25708 }, { "epoch": 0.5142, "grad_norm": 2.0625, "grad_norm_var": 0.17277399698893228, "learning_rate": 0.0001, "loss": 3.9845, "loss/crossentropy": 2.102198004722595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19085385650396347, "step": 25710 }, { "epoch": 0.51424, "grad_norm": 1.90625, "grad_norm_var": 0.1716631571451823, "learning_rate": 0.0001, "loss": 4.2193, "loss/crossentropy": 2.087872624397278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18291640281677246, "step": 25712 }, { "epoch": 0.51428, "grad_norm": 1.984375, "grad_norm_var": 0.17193094889322916, "learning_rate": 0.0001, "loss": 4.1705, "loss/crossentropy": 2.1515949964523315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19207611680030823, "step": 25714 }, { "epoch": 0.51432, "grad_norm": 1.984375, "grad_norm_var": 0.1703814188639323, "learning_rate": 0.0001, "loss": 4.1645, "loss/crossentropy": 2.2736594676971436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21448197960853577, "step": 25716 }, { "epoch": 0.51436, "grad_norm": 2.0, "grad_norm_var": 0.16901219685872396, "learning_rate": 0.0001, "loss": 4.3332, "loss/crossentropy": 1.8625124096870422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1784079223871231, "step": 25718 }, { "epoch": 0.5144, "grad_norm": 1.7890625, "grad_norm_var": 0.005600738525390625, "learning_rate": 0.0001, "loss": 3.9551, "loss/crossentropy": 2.2445143461227417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20068903267383575, "step": 25720 }, { "epoch": 0.51444, "grad_norm": 1.8125, "grad_norm_var": 0.005924224853515625, "learning_rate": 0.0001, "loss": 3.7845, "loss/crossentropy": 1.7019882202148438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17819605767726898, "step": 25722 }, { "epoch": 0.51448, "grad_norm": 2.34375, "grad_norm_var": 0.016523996988932293, "learning_rate": 0.0001, "loss": 3.8062, "loss/crossentropy": 2.077397406101227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20366767793893814, "step": 25724 }, { "epoch": 0.51452, "grad_norm": 1.96875, "grad_norm_var": 0.0159423828125, "learning_rate": 0.0001, "loss": 4.2385, "loss/crossentropy": 2.1213592290878296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19402803480625153, "step": 25726 }, { "epoch": 0.51456, "grad_norm": 1.8828125, "grad_norm_var": 0.016110992431640624, "learning_rate": 0.0001, "loss": 3.6229, "loss/crossentropy": 1.7625560760498047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17147068679332733, "step": 25728 }, { "epoch": 0.5146, "grad_norm": 1.7734375, "grad_norm_var": 0.02178955078125, "learning_rate": 0.0001, "loss": 3.8784, "loss/crossentropy": 2.108244776725769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20061994343996048, "step": 25730 }, { "epoch": 0.51464, "grad_norm": 1.84375, "grad_norm_var": 0.023787180582682293, "learning_rate": 0.0001, "loss": 3.7433, "loss/crossentropy": 2.0263237953186035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17062099277973175, "step": 25732 }, { "epoch": 0.51468, "grad_norm": 1.921875, "grad_norm_var": 0.023337554931640626, "learning_rate": 0.0001, "loss": 3.9004, "loss/crossentropy": 2.3399864435195923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22683630138635635, "step": 25734 }, { "epoch": 0.51472, "grad_norm": 2.109375, "grad_norm_var": 0.024761708577473958, "learning_rate": 0.0001, "loss": 3.9724, "loss/crossentropy": 2.179167151451111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077057957649231, "step": 25736 }, { "epoch": 0.51476, "grad_norm": 1.8984375, "grad_norm_var": 0.027428944905598957, "learning_rate": 0.0001, "loss": 3.5788, "loss/crossentropy": 1.7649011611938477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17765354365110397, "step": 25738 }, { "epoch": 0.5148, "grad_norm": 1.9140625, "grad_norm_var": 0.03184789021809896, "learning_rate": 0.0001, "loss": 4.0858, "loss/crossentropy": 2.01451712846756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20021513849496841, "step": 25740 }, { "epoch": 0.51484, "grad_norm": 2.078125, "grad_norm_var": 0.03407567342122396, "learning_rate": 0.0001, "loss": 3.8967, "loss/crossentropy": 2.043434739112854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18407747894525528, "step": 25742 }, { "epoch": 0.51488, "grad_norm": 1.9375, "grad_norm_var": 0.0338531494140625, "learning_rate": 0.0001, "loss": 3.9306, "loss/crossentropy": 1.7236329913139343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16566471755504608, "step": 25744 }, { "epoch": 0.51492, "grad_norm": 1.9375, "grad_norm_var": 0.028107706705729166, "learning_rate": 0.0001, "loss": 3.9946, "loss/crossentropy": 1.9381142258644104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18159441649913788, "step": 25746 }, { "epoch": 0.51496, "grad_norm": 1.96875, "grad_norm_var": 0.025341542561848958, "learning_rate": 0.0001, "loss": 4.1442, "loss/crossentropy": 2.262045383453369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20239067822694778, "step": 25748 }, { "epoch": 0.515, "grad_norm": 2.0, "grad_norm_var": 0.026682281494140626, "learning_rate": 0.0001, "loss": 4.0358, "loss/crossentropy": 1.980510652065277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910887509584427, "step": 25750 }, { "epoch": 0.51504, "grad_norm": 1.9765625, "grad_norm_var": 0.024494425455729166, "learning_rate": 0.0001, "loss": 3.9608, "loss/crossentropy": 2.1935293674468994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19832123070955276, "step": 25752 }, { "epoch": 0.51508, "grad_norm": 1.890625, "grad_norm_var": 0.023835245768229166, "learning_rate": 0.0001, "loss": 4.1082, "loss/crossentropy": 1.912952721118927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726488322019577, "step": 25754 }, { "epoch": 0.51512, "grad_norm": 1.734375, "grad_norm_var": 0.0124664306640625, "learning_rate": 0.0001, "loss": 3.6737, "loss/crossentropy": 1.7230817675590515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16161160171031952, "step": 25756 }, { "epoch": 0.51516, "grad_norm": 1.9140625, "grad_norm_var": 0.010846964518229167, "learning_rate": 0.0001, "loss": 3.8654, "loss/crossentropy": 1.7760237455368042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1803196743130684, "step": 25758 }, { "epoch": 0.5152, "grad_norm": 1.8671875, "grad_norm_var": 0.011126454671223958, "learning_rate": 0.0001, "loss": 4.0545, "loss/crossentropy": 2.3100990056991577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151026725769043, "step": 25760 }, { "epoch": 0.51524, "grad_norm": 1.890625, "grad_norm_var": 0.010957845052083333, "learning_rate": 0.0001, "loss": 4.2432, "loss/crossentropy": 2.1021112203598022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18987688422203064, "step": 25762 }, { "epoch": 0.51528, "grad_norm": 1.890625, "grad_norm_var": 0.011139933268229167, "learning_rate": 0.0001, "loss": 3.798, "loss/crossentropy": 1.9850506782531738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19890005141496658, "step": 25764 }, { "epoch": 0.51532, "grad_norm": 1.8828125, "grad_norm_var": 0.010253651936848959, "learning_rate": 0.0001, "loss": 4.2499, "loss/crossentropy": 2.37569797039032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20863597840070724, "step": 25766 }, { "epoch": 0.51536, "grad_norm": 2.015625, "grad_norm_var": 0.011994425455729167, "learning_rate": 0.0001, "loss": 4.202, "loss/crossentropy": 2.3777748346328735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22870129346847534, "step": 25768 }, { "epoch": 0.5154, "grad_norm": 1.9609375, "grad_norm_var": 0.009114329020182292, "learning_rate": 0.0001, "loss": 4.2291, "loss/crossentropy": 2.071221649646759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19465549290180206, "step": 25770 }, { "epoch": 0.51544, "grad_norm": 1.7265625, "grad_norm_var": 0.009650675455729167, "learning_rate": 0.0001, "loss": 3.8449, "loss/crossentropy": 2.2113492488861084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19282811880111694, "step": 25772 }, { "epoch": 0.51548, "grad_norm": 2.28125, "grad_norm_var": 0.0171539306640625, "learning_rate": 0.0001, "loss": 4.0252, "loss/crossentropy": 1.8800004124641418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772722378373146, "step": 25774 }, { "epoch": 0.51552, "grad_norm": 2.015625, "grad_norm_var": 0.01837946573893229, "learning_rate": 0.0001, "loss": 3.8032, "loss/crossentropy": 1.9569828510284424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836630329489708, "step": 25776 }, { "epoch": 0.51556, "grad_norm": 2.125, "grad_norm_var": 0.020114898681640625, "learning_rate": 0.0001, "loss": 4.2328, "loss/crossentropy": 2.035541355609894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19203660637140274, "step": 25778 }, { "epoch": 0.5156, "grad_norm": 1.84375, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 3.9283, "loss/crossentropy": 2.2393122911453247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18675968796014786, "step": 25780 }, { "epoch": 0.51564, "grad_norm": 1.9140625, "grad_norm_var": 0.019632975260416668, "learning_rate": 0.0001, "loss": 4.0009, "loss/crossentropy": 2.0540746450424194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20924384146928787, "step": 25782 }, { "epoch": 0.51568, "grad_norm": 2.015625, "grad_norm_var": 0.01869481404622396, "learning_rate": 0.0001, "loss": 4.2234, "loss/crossentropy": 2.168312907218933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21717043220996857, "step": 25784 }, { "epoch": 0.51572, "grad_norm": 1.96875, "grad_norm_var": 0.017252349853515626, "learning_rate": 0.0001, "loss": 3.8948, "loss/crossentropy": 1.9122300744056702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18530257046222687, "step": 25786 }, { "epoch": 0.51576, "grad_norm": 2.046875, "grad_norm_var": 0.018314615885416666, "learning_rate": 0.0001, "loss": 3.7462, "loss/crossentropy": 1.813349425792694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17834128439426422, "step": 25788 }, { "epoch": 0.5158, "grad_norm": 2.046875, "grad_norm_var": 0.011604563395182291, "learning_rate": 0.0001, "loss": 4.197, "loss/crossentropy": 2.217802047729492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1857040449976921, "step": 25790 }, { "epoch": 0.51584, "grad_norm": 1.8828125, "grad_norm_var": 0.011132558186848959, "learning_rate": 0.0001, "loss": 3.8139, "loss/crossentropy": 2.0986403822898865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1917191743850708, "step": 25792 }, { "epoch": 0.51588, "grad_norm": 2.125, "grad_norm_var": 0.011250813802083334, "learning_rate": 0.0001, "loss": 4.1166, "loss/crossentropy": 1.7962473034858704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1671363189816475, "step": 25794 }, { "epoch": 0.51592, "grad_norm": 1.84375, "grad_norm_var": 0.012215169270833333, "learning_rate": 0.0001, "loss": 3.7758, "loss/crossentropy": 2.001879632472992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19479414075613022, "step": 25796 }, { "epoch": 0.51596, "grad_norm": 1.9296875, "grad_norm_var": 0.012018839518229166, "learning_rate": 0.0001, "loss": 3.9258, "loss/crossentropy": 2.228678584098816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20541846752166748, "step": 25798 }, { "epoch": 0.516, "grad_norm": 2.109375, "grad_norm_var": 0.013679758707682291, "learning_rate": 0.0001, "loss": 4.1654, "loss/crossentropy": 2.283624291419983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23651224374771118, "step": 25800 }, { "epoch": 0.51604, "grad_norm": 2.078125, "grad_norm_var": 0.015006510416666667, "learning_rate": 0.0001, "loss": 3.9617, "loss/crossentropy": 1.9860345125198364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878475695848465, "step": 25802 }, { "epoch": 0.51608, "grad_norm": 1.9921875, "grad_norm_var": 0.01031494140625, "learning_rate": 0.0001, "loss": 3.9197, "loss/crossentropy": 2.1123459935188293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22042006999254227, "step": 25804 }, { "epoch": 0.51612, "grad_norm": 2.03125, "grad_norm_var": 0.009886678059895833, "learning_rate": 0.0001, "loss": 3.9532, "loss/crossentropy": 2.092814803123474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2060900628566742, "step": 25806 }, { "epoch": 0.51616, "grad_norm": 1.90625, "grad_norm_var": 0.0083404541015625, "learning_rate": 0.0001, "loss": 3.8515, "loss/crossentropy": 1.9071126580238342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18779786676168442, "step": 25808 }, { "epoch": 0.5162, "grad_norm": 1.8984375, "grad_norm_var": 0.006819661458333333, "learning_rate": 0.0001, "loss": 3.9823, "loss/crossentropy": 1.9593830704689026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17265263199806213, "step": 25810 }, { "epoch": 0.51624, "grad_norm": 1.9140625, "grad_norm_var": 0.00438232421875, "learning_rate": 0.0001, "loss": 4.0247, "loss/crossentropy": 1.8603730201721191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18761274218559265, "step": 25812 }, { "epoch": 0.51628, "grad_norm": 1.8359375, "grad_norm_var": 0.007425689697265625, "learning_rate": 0.0001, "loss": 3.7386, "loss/crossentropy": 1.8858801126480103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186118483543396, "step": 25814 }, { "epoch": 0.51632, "grad_norm": 2.796875, "grad_norm_var": 0.051513671875, "learning_rate": 0.0001, "loss": 4.0368, "loss/crossentropy": 2.1853126287460327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23933759331703186, "step": 25816 }, { "epoch": 0.51636, "grad_norm": 1.9453125, "grad_norm_var": 0.05208104451497396, "learning_rate": 0.0001, "loss": 4.123, "loss/crossentropy": 2.0599172115325928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19681133329868317, "step": 25818 }, { "epoch": 0.5164, "grad_norm": 2.234375, "grad_norm_var": 0.055663045247395834, "learning_rate": 0.0001, "loss": 4.2483, "loss/crossentropy": 2.371991515159607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2135465070605278, "step": 25820 }, { "epoch": 0.51644, "grad_norm": 1.9921875, "grad_norm_var": 0.05572077433268229, "learning_rate": 0.0001, "loss": 4.2967, "loss/crossentropy": 2.1175169944763184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994188129901886, "step": 25822 }, { "epoch": 0.51648, "grad_norm": 1.8515625, "grad_norm_var": 0.059357706705729166, "learning_rate": 0.0001, "loss": 3.735, "loss/crossentropy": 2.074811100959778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20690523087978363, "step": 25824 }, { "epoch": 0.51652, "grad_norm": 1.8984375, "grad_norm_var": 0.05923258463541667, "learning_rate": 0.0001, "loss": 3.9436, "loss/crossentropy": 2.041485607624054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19222356379032135, "step": 25826 }, { "epoch": 0.51656, "grad_norm": 1.84375, "grad_norm_var": 0.060212961832682294, "learning_rate": 0.0001, "loss": 3.9312, "loss/crossentropy": 1.9386745691299438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20328567177057266, "step": 25828 }, { "epoch": 0.5166, "grad_norm": 1.9765625, "grad_norm_var": 0.055692291259765624, "learning_rate": 0.0001, "loss": 4.1459, "loss/crossentropy": 1.8701601028442383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17632923275232315, "step": 25830 }, { "epoch": 0.51664, "grad_norm": 2.125, "grad_norm_var": 0.012333170572916666, "learning_rate": 0.0001, "loss": 4.0424, "loss/crossentropy": 2.258900284767151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108033150434494, "step": 25832 }, { "epoch": 0.51668, "grad_norm": 2.03125, "grad_norm_var": 0.016806793212890626, "learning_rate": 0.0001, "loss": 4.4649, "loss/crossentropy": 2.10329806804657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19907134026288986, "step": 25834 }, { "epoch": 0.51672, "grad_norm": 2.109375, "grad_norm_var": 0.014208984375, "learning_rate": 0.0001, "loss": 3.9527, "loss/crossentropy": 1.692655324935913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17038661241531372, "step": 25836 }, { "epoch": 0.51676, "grad_norm": 2.109375, "grad_norm_var": 0.01628392537434896, "learning_rate": 0.0001, "loss": 4.122, "loss/crossentropy": 2.0541951060295105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20199306309223175, "step": 25838 }, { "epoch": 0.5168, "grad_norm": 2.171875, "grad_norm_var": 0.014721425374348958, "learning_rate": 0.0001, "loss": 4.2206, "loss/crossentropy": 2.0732097029685974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21288836747407913, "step": 25840 }, { "epoch": 0.51684, "grad_norm": 2.015625, "grad_norm_var": 0.013939412434895833, "learning_rate": 0.0001, "loss": 4.2017, "loss/crossentropy": 2.282121777534485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21497543156147003, "step": 25842 }, { "epoch": 0.51688, "grad_norm": 1.9140625, "grad_norm_var": 0.0136383056640625, "learning_rate": 0.0001, "loss": 3.978, "loss/crossentropy": 1.8445320129394531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17342890799045563, "step": 25844 }, { "epoch": 0.51692, "grad_norm": 1.8203125, "grad_norm_var": 0.0162109375, "learning_rate": 0.0001, "loss": 3.9638, "loss/crossentropy": 1.9874401092529297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18226970732212067, "step": 25846 }, { "epoch": 0.51696, "grad_norm": 1.9296875, "grad_norm_var": 0.015290323893229167, "learning_rate": 0.0001, "loss": 4.0861, "loss/crossentropy": 2.0648937225341797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19047797471284866, "step": 25848 }, { "epoch": 0.517, "grad_norm": 9.125, "grad_norm_var": 3.220908355712891, "learning_rate": 0.0001, "loss": 4.2993, "loss/crossentropy": 2.3565629720687866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20815564692020416, "step": 25850 }, { "epoch": 0.51704, "grad_norm": 2.15625, "grad_norm_var": 3.200935872395833, "learning_rate": 0.0001, "loss": 4.2516, "loss/crossentropy": 1.9872604608535767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20761053264141083, "step": 25852 }, { "epoch": 0.51708, "grad_norm": 1.921875, "grad_norm_var": 3.206274159749349, "learning_rate": 0.0001, "loss": 3.8383, "loss/crossentropy": 1.6661525964736938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16651833057403564, "step": 25854 }, { "epoch": 0.51712, "grad_norm": 2.03125, "grad_norm_var": 3.218633778889974, "learning_rate": 0.0001, "loss": 4.2274, "loss/crossentropy": 2.043734908103943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980709508061409, "step": 25856 }, { "epoch": 0.51716, "grad_norm": 1.921875, "grad_norm_var": 3.2231727600097657, "learning_rate": 0.0001, "loss": 3.9739, "loss/crossentropy": 1.897942066192627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18658392876386642, "step": 25858 }, { "epoch": 0.5172, "grad_norm": 1.9609375, "grad_norm_var": 3.216633097330729, "learning_rate": 0.0001, "loss": 4.1149, "loss/crossentropy": 2.1246655583381653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21123212575912476, "step": 25860 }, { "epoch": 0.51724, "grad_norm": 2.03125, "grad_norm_var": 3.201690419514974, "learning_rate": 0.0001, "loss": 4.0062, "loss/crossentropy": 2.000667631626129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2007341906428337, "step": 25862 }, { "epoch": 0.51728, "grad_norm": 2.015625, "grad_norm_var": 3.202071126302083, "learning_rate": 0.0001, "loss": 3.987, "loss/crossentropy": 2.09254252910614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19557519257068634, "step": 25864 }, { "epoch": 0.51732, "grad_norm": 1.9140625, "grad_norm_var": 0.012505849202473959, "learning_rate": 0.0001, "loss": 4.1372, "loss/crossentropy": 2.0801188945770264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20193548500537872, "step": 25866 }, { "epoch": 0.51736, "grad_norm": 1.8359375, "grad_norm_var": 0.004166412353515625, "learning_rate": 0.0001, "loss": 3.9061, "loss/crossentropy": 2.0977261662483215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003898024559021, "step": 25868 }, { "epoch": 0.5174, "grad_norm": 1.953125, "grad_norm_var": 0.005246734619140625, "learning_rate": 0.0001, "loss": 3.7979, "loss/crossentropy": 1.5647385120391846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14602012932300568, "step": 25870 }, { "epoch": 0.51744, "grad_norm": 1.9921875, "grad_norm_var": 0.0049631754557291664, "learning_rate": 0.0001, "loss": 3.8804, "loss/crossentropy": 1.9746862649917603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19539950788021088, "step": 25872 }, { "epoch": 0.51748, "grad_norm": 1.9375, "grad_norm_var": 0.005721028645833333, "learning_rate": 0.0001, "loss": 4.143, "loss/crossentropy": 2.08547705411911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976427659392357, "step": 25874 }, { "epoch": 0.51752, "grad_norm": 2.0625, "grad_norm_var": 0.007647450764973958, "learning_rate": 0.0001, "loss": 4.017, "loss/crossentropy": 2.040421783924103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20465777814388275, "step": 25876 }, { "epoch": 0.51756, "grad_norm": 1.8359375, "grad_norm_var": 0.008272043863932292, "learning_rate": 0.0001, "loss": 3.7525, "loss/crossentropy": 1.8793463706970215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17017312347888947, "step": 25878 }, { "epoch": 0.5176, "grad_norm": 1.984375, "grad_norm_var": 0.007903798421223959, "learning_rate": 0.0001, "loss": 4.0791, "loss/crossentropy": 2.263822913169861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2253909707069397, "step": 25880 }, { "epoch": 0.51764, "grad_norm": 2.34375, "grad_norm_var": 0.01856689453125, "learning_rate": 0.0001, "loss": 4.2281, "loss/crossentropy": 1.8976882100105286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1699293926358223, "step": 25882 }, { "epoch": 0.51768, "grad_norm": 1.953125, "grad_norm_var": 0.016641998291015626, "learning_rate": 0.0001, "loss": 4.0898, "loss/crossentropy": 2.102656662464142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189642496407032, "step": 25884 }, { "epoch": 0.51772, "grad_norm": 2.28125, "grad_norm_var": 0.0206451416015625, "learning_rate": 0.0001, "loss": 4.0169, "loss/crossentropy": 2.1792502403259277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2173495665192604, "step": 25886 }, { "epoch": 0.51776, "grad_norm": 2.0625, "grad_norm_var": 0.021286773681640624, "learning_rate": 0.0001, "loss": 4.2587, "loss/crossentropy": 2.3405139446258545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20445296168327332, "step": 25888 }, { "epoch": 0.5178, "grad_norm": 1.9296875, "grad_norm_var": 0.0225830078125, "learning_rate": 0.0001, "loss": 3.8529, "loss/crossentropy": 2.116323173046112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.204654723405838, "step": 25890 }, { "epoch": 0.51784, "grad_norm": 1.8515625, "grad_norm_var": 0.021628570556640626, "learning_rate": 0.0001, "loss": 3.9595, "loss/crossentropy": 1.9406288266181946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19154663383960724, "step": 25892 }, { "epoch": 0.51788, "grad_norm": 2.484375, "grad_norm_var": 0.0339263916015625, "learning_rate": 0.0001, "loss": 4.0066, "loss/crossentropy": 2.008155882358551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18465732783079147, "step": 25894 }, { "epoch": 0.51792, "grad_norm": 2.0625, "grad_norm_var": 0.03385594685872396, "learning_rate": 0.0001, "loss": 4.2731, "loss/crossentropy": 2.2472530007362366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22532539069652557, "step": 25896 }, { "epoch": 0.51796, "grad_norm": 1.90625, "grad_norm_var": 0.028238932291666668, "learning_rate": 0.0001, "loss": 3.9856, "loss/crossentropy": 2.0776681900024414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18579304963350296, "step": 25898 }, { "epoch": 0.518, "grad_norm": 1.8984375, "grad_norm_var": 0.029670206705729167, "learning_rate": 0.0001, "loss": 3.8088, "loss/crossentropy": 2.053288221359253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1885019615292549, "step": 25900 }, { "epoch": 0.51804, "grad_norm": 1.84375, "grad_norm_var": 0.02557551066080729, "learning_rate": 0.0001, "loss": 4.0077, "loss/crossentropy": 1.6223698258399963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21323902904987335, "step": 25902 }, { "epoch": 0.51808, "grad_norm": 1.71875, "grad_norm_var": 0.03012873331705729, "learning_rate": 0.0001, "loss": 3.8248, "loss/crossentropy": 2.012439727783203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17938166856765747, "step": 25904 }, { "epoch": 0.51812, "grad_norm": 1.8828125, "grad_norm_var": 0.02967096964518229, "learning_rate": 0.0001, "loss": 4.0141, "loss/crossentropy": 1.975797176361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2021034210920334, "step": 25906 }, { "epoch": 0.51816, "grad_norm": 2.15625, "grad_norm_var": 0.03150126139322917, "learning_rate": 0.0001, "loss": 4.0165, "loss/crossentropy": 2.1858623027801514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22130713611841202, "step": 25908 }, { "epoch": 0.5182, "grad_norm": 1.84375, "grad_norm_var": 0.0142486572265625, "learning_rate": 0.0001, "loss": 3.9008, "loss/crossentropy": 1.8949698209762573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19013772159814835, "step": 25910 }, { "epoch": 0.51824, "grad_norm": 1.9609375, "grad_norm_var": 0.013266754150390626, "learning_rate": 0.0001, "loss": 3.9872, "loss/crossentropy": 1.9657301902770996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19463873654603958, "step": 25912 }, { "epoch": 0.51828, "grad_norm": 1.9609375, "grad_norm_var": 0.0107818603515625, "learning_rate": 0.0001, "loss": 3.9089, "loss/crossentropy": 1.9067611694335938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1767619624733925, "step": 25914 }, { "epoch": 0.51832, "grad_norm": 1.7734375, "grad_norm_var": 0.012032063802083333, "learning_rate": 0.0001, "loss": 3.8141, "loss/crossentropy": 1.9040088653564453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19490251690149307, "step": 25916 }, { "epoch": 0.51836, "grad_norm": 1.6796875, "grad_norm_var": 0.014776357014973958, "learning_rate": 0.0001, "loss": 3.8649, "loss/crossentropy": 1.9055203795433044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1863974630832672, "step": 25918 }, { "epoch": 0.5184, "grad_norm": 1.828125, "grad_norm_var": 0.012507883707682292, "learning_rate": 0.0001, "loss": 3.8156, "loss/crossentropy": 2.01656973361969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821809709072113, "step": 25920 }, { "epoch": 0.51844, "grad_norm": 1.859375, "grad_norm_var": 0.012629191080729166, "learning_rate": 0.0001, "loss": 3.9567, "loss/crossentropy": 1.9760606288909912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18721874803304672, "step": 25922 }, { "epoch": 0.51848, "grad_norm": 1.9375, "grad_norm_var": 0.008579254150390625, "learning_rate": 0.0001, "loss": 3.8369, "loss/crossentropy": 2.002072513103485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038237452507019, "step": 25924 }, { "epoch": 0.51852, "grad_norm": 1.8203125, "grad_norm_var": 0.008454386393229167, "learning_rate": 0.0001, "loss": 4.0081, "loss/crossentropy": 2.329070210456848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19514091312885284, "step": 25926 }, { "epoch": 0.51856, "grad_norm": 1.953125, "grad_norm_var": 0.008243815104166666, "learning_rate": 0.0001, "loss": 3.8548, "loss/crossentropy": 2.008804202079773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18007662147283554, "step": 25928 }, { "epoch": 0.5186, "grad_norm": 1.8359375, "grad_norm_var": 0.00882568359375, "learning_rate": 0.0001, "loss": 3.8927, "loss/crossentropy": 1.7008295059204102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17658023536205292, "step": 25930 }, { "epoch": 0.51864, "grad_norm": 1.9296875, "grad_norm_var": 0.009455362955729166, "learning_rate": 0.0001, "loss": 4.2386, "loss/crossentropy": 2.215006470680237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20486676692962646, "step": 25932 }, { "epoch": 0.51868, "grad_norm": 1.890625, "grad_norm_var": 0.006404368082682291, "learning_rate": 0.0001, "loss": 3.6252, "loss/crossentropy": 1.5741249322891235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15962044149637222, "step": 25934 }, { "epoch": 0.51872, "grad_norm": 1.7421875, "grad_norm_var": 0.0077545166015625, "learning_rate": 0.0001, "loss": 3.8169, "loss/crossentropy": 1.8923559784889221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1873473823070526, "step": 25936 }, { "epoch": 0.51876, "grad_norm": 1.9140625, "grad_norm_var": 0.0076067606608072914, "learning_rate": 0.0001, "loss": 3.8542, "loss/crossentropy": 1.7315371632575989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17056572437286377, "step": 25938 }, { "epoch": 0.5188, "grad_norm": 1.9140625, "grad_norm_var": 0.007469685872395834, "learning_rate": 0.0001, "loss": 3.9419, "loss/crossentropy": 1.9019699096679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19772474467754364, "step": 25940 }, { "epoch": 0.51884, "grad_norm": 1.953125, "grad_norm_var": 0.006068674723307291, "learning_rate": 0.0001, "loss": 3.9864, "loss/crossentropy": 2.0523456931114197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19947312027215958, "step": 25942 }, { "epoch": 0.51888, "grad_norm": 2.0, "grad_norm_var": 0.005785115559895833, "learning_rate": 0.0001, "loss": 4.046, "loss/crossentropy": 1.9278483390808105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19414179772138596, "step": 25944 }, { "epoch": 0.51892, "grad_norm": 1.9375, "grad_norm_var": 0.004874674479166666, "learning_rate": 0.0001, "loss": 3.8827, "loss/crossentropy": 2.059605300426483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18402428179979324, "step": 25946 }, { "epoch": 0.51896, "grad_norm": 2.46875, "grad_norm_var": 0.023219553629557292, "learning_rate": 0.0001, "loss": 4.34, "loss/crossentropy": 2.2166699171066284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21071894466876984, "step": 25948 }, { "epoch": 0.519, "grad_norm": 1.9765625, "grad_norm_var": 0.03566665649414062, "learning_rate": 0.0001, "loss": 4.3082, "loss/crossentropy": 1.8669118881225586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2712375670671463, "step": 25950 }, { "epoch": 0.51904, "grad_norm": 1.953125, "grad_norm_var": 0.03169530232747396, "learning_rate": 0.0001, "loss": 4.1025, "loss/crossentropy": 1.9831582307815552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18969787657260895, "step": 25952 }, { "epoch": 0.51908, "grad_norm": 3.421875, "grad_norm_var": 0.15215835571289063, "learning_rate": 0.0001, "loss": 4.2287, "loss/crossentropy": 2.092381715774536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.31603462994098663, "step": 25954 }, { "epoch": 0.51912, "grad_norm": 1.90625, "grad_norm_var": 0.15208231608072917, "learning_rate": 0.0001, "loss": 3.7941, "loss/crossentropy": 2.083071291446686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18124651163816452, "step": 25956 }, { "epoch": 0.51916, "grad_norm": 2.046875, "grad_norm_var": 0.1500017801920573, "learning_rate": 0.0001, "loss": 4.2498, "loss/crossentropy": 2.280423641204834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21027068048715591, "step": 25958 }, { "epoch": 0.5192, "grad_norm": 1.78125, "grad_norm_var": 0.15642471313476564, "learning_rate": 0.0001, "loss": 4.0311, "loss/crossentropy": 2.176509916782379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987079679965973, "step": 25960 }, { "epoch": 0.51924, "grad_norm": 1.7734375, "grad_norm_var": 0.16028416951497396, "learning_rate": 0.0001, "loss": 3.9031, "loss/crossentropy": 1.918852150440216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17551065981388092, "step": 25962 }, { "epoch": 0.51928, "grad_norm": 1.9296875, "grad_norm_var": 0.15350748697916666, "learning_rate": 0.0001, "loss": 3.8734, "loss/crossentropy": 2.238860845565796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2066703960299492, "step": 25964 }, { "epoch": 0.51932, "grad_norm": 2.09375, "grad_norm_var": 0.1447771708170573, "learning_rate": 0.0001, "loss": 4.261, "loss/crossentropy": 2.1192798614501953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21037639677524567, "step": 25966 }, { "epoch": 0.51936, "grad_norm": 1.84375, "grad_norm_var": 0.14727554321289063, "learning_rate": 0.0001, "loss": 3.9908, "loss/crossentropy": 2.0698190927505493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19720114767551422, "step": 25968 }, { "epoch": 0.5194, "grad_norm": 2.15625, "grad_norm_var": 0.0139801025390625, "learning_rate": 0.0001, "loss": 4.3106, "loss/crossentropy": 2.273471713066101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21805588901042938, "step": 25970 }, { "epoch": 0.51944, "grad_norm": 1.8828125, "grad_norm_var": 0.0140869140625, "learning_rate": 0.0001, "loss": 3.9038, "loss/crossentropy": 2.1107550263404846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19864343851804733, "step": 25972 }, { "epoch": 0.51948, "grad_norm": 1.9375, "grad_norm_var": 0.013566080729166667, "learning_rate": 0.0001, "loss": 4.0438, "loss/crossentropy": 1.9644930362701416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18844127655029297, "step": 25974 }, { "epoch": 0.51952, "grad_norm": 1.953125, "grad_norm_var": 0.0092926025390625, "learning_rate": 0.0001, "loss": 4.1062, "loss/crossentropy": 2.120868682861328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19970117509365082, "step": 25976 }, { "epoch": 0.51956, "grad_norm": 1.90625, "grad_norm_var": 0.007201131184895833, "learning_rate": 0.0001, "loss": 3.8065, "loss/crossentropy": 1.6433513164520264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15953880548477173, "step": 25978 }, { "epoch": 0.5196, "grad_norm": 1.9375, "grad_norm_var": 0.009043121337890625, "learning_rate": 0.0001, "loss": 4.1572, "loss/crossentropy": 2.0936105847358704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19382526725530624, "step": 25980 }, { "epoch": 0.51964, "grad_norm": 1.9140625, "grad_norm_var": 0.008234659830729166, "learning_rate": 0.0001, "loss": 3.9524, "loss/crossentropy": 2.3531426191329956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20965352654457092, "step": 25982 }, { "epoch": 0.51968, "grad_norm": 1.984375, "grad_norm_var": 0.006493123372395834, "learning_rate": 0.0001, "loss": 4.1327, "loss/crossentropy": 2.342974007129669, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20701958984136581, "step": 25984 }, { "epoch": 0.51972, "grad_norm": 2.09375, "grad_norm_var": 0.0052154541015625, "learning_rate": 0.0001, "loss": 4.3142, "loss/crossentropy": 2.193662643432617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072567343711853, "step": 25986 }, { "epoch": 0.51976, "grad_norm": 1.9375, "grad_norm_var": 0.004809315999348958, "learning_rate": 0.0001, "loss": 4.1056, "loss/crossentropy": 1.8900989294052124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910807117819786, "step": 25988 }, { "epoch": 0.5198, "grad_norm": 1.984375, "grad_norm_var": 0.005387369791666667, "learning_rate": 0.0001, "loss": 4.3945, "loss/crossentropy": 2.243148624897003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20235490798950195, "step": 25990 }, { "epoch": 0.51984, "grad_norm": 1.796875, "grad_norm_var": 0.007513173421223958, "learning_rate": 0.0001, "loss": 3.9088, "loss/crossentropy": 2.105653166770935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20262588560581207, "step": 25992 }, { "epoch": 0.51988, "grad_norm": 1.9140625, "grad_norm_var": 0.0073811848958333336, "learning_rate": 0.0001, "loss": 4.0318, "loss/crossentropy": 1.9967975616455078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18423616141080856, "step": 25994 }, { "epoch": 0.51992, "grad_norm": 2.203125, "grad_norm_var": 0.009565989176432291, "learning_rate": 0.0001, "loss": 4.1648, "loss/crossentropy": 2.357354164123535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126748189330101, "step": 25996 }, { "epoch": 0.51996, "grad_norm": 1.671875, "grad_norm_var": 0.0246002197265625, "learning_rate": 0.0001, "loss": 3.6427, "loss/crossentropy": 2.22074818611145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187141053378582, "step": 25998 }, { "epoch": 0.52, "grad_norm": 1.984375, "grad_norm_var": 0.0246002197265625, "learning_rate": 0.0001, "loss": 4.1297, "loss/crossentropy": 2.0902404189109802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18575181812047958, "step": 26000 }, { "epoch": 0.52004, "grad_norm": 1.9921875, "grad_norm_var": 0.025172678629557292, "learning_rate": 0.0001, "loss": 4.2075, "loss/crossentropy": 1.9274433851242065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.178068108856678, "step": 26002 }, { "epoch": 0.52008, "grad_norm": 1.8515625, "grad_norm_var": 0.026362864176432292, "learning_rate": 0.0001, "loss": 4.0935, "loss/crossentropy": 2.3539743423461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101089358329773, "step": 26004 }, { "epoch": 0.52012, "grad_norm": 1.9765625, "grad_norm_var": 0.027264149983723958, "learning_rate": 0.0001, "loss": 3.9747, "loss/crossentropy": 1.7968198657035828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17364778369665146, "step": 26006 }, { "epoch": 0.52016, "grad_norm": 1.9296875, "grad_norm_var": 0.025731404622395832, "learning_rate": 0.0001, "loss": 4.0823, "loss/crossentropy": 1.7845246195793152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18205135315656662, "step": 26008 }, { "epoch": 0.5202, "grad_norm": 1.875, "grad_norm_var": 0.027034250895182292, "learning_rate": 0.0001, "loss": 3.8676, "loss/crossentropy": 2.309031844139099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21052244305610657, "step": 26010 }, { "epoch": 0.52024, "grad_norm": 2.0625, "grad_norm_var": 0.024491373697916666, "learning_rate": 0.0001, "loss": 3.9956, "loss/crossentropy": 1.9549171328544617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2348322793841362, "step": 26012 }, { "epoch": 0.52028, "grad_norm": 2.03125, "grad_norm_var": 0.009211222330729166, "learning_rate": 0.0001, "loss": 4.3597, "loss/crossentropy": 2.5377613306045532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23388546705245972, "step": 26014 }, { "epoch": 0.52032, "grad_norm": 2.0625, "grad_norm_var": 0.009912109375, "learning_rate": 0.0001, "loss": 4.244, "loss/crossentropy": 2.1748557090759277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20094042271375656, "step": 26016 }, { "epoch": 0.52036, "grad_norm": 1.921875, "grad_norm_var": 0.007661946614583333, "learning_rate": 0.0001, "loss": 3.8643, "loss/crossentropy": 1.4638828039169312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15480295568704605, "step": 26018 }, { "epoch": 0.5204, "grad_norm": 1.8828125, "grad_norm_var": 0.011408487955729166, "learning_rate": 0.0001, "loss": 4.041, "loss/crossentropy": 2.058153450489044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19334695488214493, "step": 26020 }, { "epoch": 0.52044, "grad_norm": 1.96875, "grad_norm_var": 0.011543782552083333, "learning_rate": 0.0001, "loss": 4.1254, "loss/crossentropy": 2.0739980936050415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19739702343940735, "step": 26022 }, { "epoch": 0.52048, "grad_norm": 1.8984375, "grad_norm_var": 0.0109619140625, "learning_rate": 0.0001, "loss": 4.1202, "loss/crossentropy": 2.0489348769187927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2435540407896042, "step": 26024 }, { "epoch": 0.52052, "grad_norm": 2.0625, "grad_norm_var": 0.010138956705729167, "learning_rate": 0.0001, "loss": 4.1301, "loss/crossentropy": 1.9589496850967407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1817004606127739, "step": 26026 }, { "epoch": 0.52056, "grad_norm": 2.046875, "grad_norm_var": 0.009032185872395833, "learning_rate": 0.0001, "loss": 4.1931, "loss/crossentropy": 2.2213997840881348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844081044197083, "step": 26028 }, { "epoch": 0.5206, "grad_norm": 1.9609375, "grad_norm_var": 0.010863240559895833, "learning_rate": 0.0001, "loss": 3.9974, "loss/crossentropy": 1.920683741569519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19115915894508362, "step": 26030 }, { "epoch": 0.52064, "grad_norm": 1.84375, "grad_norm_var": 0.012599436442057292, "learning_rate": 0.0001, "loss": 4.011, "loss/crossentropy": 2.142166316509247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18952738493680954, "step": 26032 }, { "epoch": 0.52068, "grad_norm": 2.015625, "grad_norm_var": 0.013067372639973958, "learning_rate": 0.0001, "loss": 4.2289, "loss/crossentropy": 2.1924999952316284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21429342031478882, "step": 26034 }, { "epoch": 0.52072, "grad_norm": 1.890625, "grad_norm_var": 0.010874176025390625, "learning_rate": 0.0001, "loss": 3.7799, "loss/crossentropy": 2.1277804374694824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19686074554920197, "step": 26036 }, { "epoch": 0.52076, "grad_norm": 1.8671875, "grad_norm_var": 0.010959625244140625, "learning_rate": 0.0001, "loss": 3.8865, "loss/crossentropy": 2.2193630933761597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20152346789836884, "step": 26038 }, { "epoch": 0.5208, "grad_norm": 1.921875, "grad_norm_var": 0.010545857747395833, "learning_rate": 0.0001, "loss": 4.1349, "loss/crossentropy": 1.8031319975852966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18107134103775024, "step": 26040 }, { "epoch": 0.52084, "grad_norm": 1.8671875, "grad_norm_var": 0.009032185872395833, "learning_rate": 0.0001, "loss": 4.0376, "loss/crossentropy": 1.9713850021362305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17920061200857162, "step": 26042 }, { "epoch": 0.52088, "grad_norm": 1.796875, "grad_norm_var": 0.00533447265625, "learning_rate": 0.0001, "loss": 4.142, "loss/crossentropy": 2.5888466835021973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081139236688614, "step": 26044 }, { "epoch": 0.52092, "grad_norm": 1.8515625, "grad_norm_var": 0.004811350504557292, "learning_rate": 0.0001, "loss": 4.1217, "loss/crossentropy": 2.4226022958755493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20813634991645813, "step": 26046 }, { "epoch": 0.52096, "grad_norm": 1.8984375, "grad_norm_var": 0.005248006184895833, "learning_rate": 0.0001, "loss": 4.1199, "loss/crossentropy": 2.179656744003296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19288229197263718, "step": 26048 }, { "epoch": 0.521, "grad_norm": 1.8515625, "grad_norm_var": 0.0041463216145833336, "learning_rate": 0.0001, "loss": 4.0571, "loss/crossentropy": 2.014236092567444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1917470395565033, "step": 26050 }, { "epoch": 0.52104, "grad_norm": 1.890625, "grad_norm_var": 0.0035519917805989583, "learning_rate": 0.0001, "loss": 3.7101, "loss/crossentropy": 2.12498939037323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18156899511814117, "step": 26052 }, { "epoch": 0.52108, "grad_norm": 1.875, "grad_norm_var": 0.003342437744140625, "learning_rate": 0.0001, "loss": 3.9443, "loss/crossentropy": 1.8441026210784912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18120328336954117, "step": 26054 }, { "epoch": 0.52112, "grad_norm": 2.109375, "grad_norm_var": 0.008603668212890625, "learning_rate": 0.0001, "loss": 3.9958, "loss/crossentropy": 2.047904908657074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19617904722690582, "step": 26056 }, { "epoch": 0.52116, "grad_norm": 1.8671875, "grad_norm_var": 0.008697255452473959, "learning_rate": 0.0001, "loss": 4.0111, "loss/crossentropy": 1.7961552739143372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16928328573703766, "step": 26058 }, { "epoch": 0.5212, "grad_norm": 1.7890625, "grad_norm_var": 0.008763631184895834, "learning_rate": 0.0001, "loss": 4.0501, "loss/crossentropy": 2.0813616514205933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18402273952960968, "step": 26060 }, { "epoch": 0.52124, "grad_norm": 1.90625, "grad_norm_var": 0.008632151285807292, "learning_rate": 0.0001, "loss": 3.8866, "loss/crossentropy": 2.13175368309021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19714613258838654, "step": 26062 }, { "epoch": 0.52128, "grad_norm": 1.859375, "grad_norm_var": 0.008640289306640625, "learning_rate": 0.0001, "loss": 3.9793, "loss/crossentropy": 2.0052719712257385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20111151784658432, "step": 26064 }, { "epoch": 0.52132, "grad_norm": 2.171875, "grad_norm_var": 0.013216145833333333, "learning_rate": 0.0001, "loss": 4.2311, "loss/crossentropy": 1.939625859260559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20675888657569885, "step": 26066 }, { "epoch": 0.52136, "grad_norm": 1.9140625, "grad_norm_var": 0.013255818684895834, "learning_rate": 0.0001, "loss": 3.7256, "loss/crossentropy": 2.208404779434204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18266795575618744, "step": 26068 }, { "epoch": 0.5214, "grad_norm": 1.8828125, "grad_norm_var": 0.0123687744140625, "learning_rate": 0.0001, "loss": 3.8233, "loss/crossentropy": 1.8239786028862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18409103155136108, "step": 26070 }, { "epoch": 0.52144, "grad_norm": 1.8125, "grad_norm_var": 0.007743326822916666, "learning_rate": 0.0001, "loss": 4.0984, "loss/crossentropy": 1.951551616191864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17273114621639252, "step": 26072 }, { "epoch": 0.52148, "grad_norm": 1.8671875, "grad_norm_var": 0.007899729410807292, "learning_rate": 0.0001, "loss": 3.6726, "loss/crossentropy": 1.6747522354125977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1755337342619896, "step": 26074 }, { "epoch": 0.52152, "grad_norm": 1.9375, "grad_norm_var": 0.007661946614583333, "learning_rate": 0.0001, "loss": 4.0389, "loss/crossentropy": 1.954121172428131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2060263231396675, "step": 26076 }, { "epoch": 0.52156, "grad_norm": 1.9296875, "grad_norm_var": 0.007757314046223958, "learning_rate": 0.0001, "loss": 4.0388, "loss/crossentropy": 2.025652050971985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19741995632648468, "step": 26078 }, { "epoch": 0.5216, "grad_norm": 2.046875, "grad_norm_var": 0.010249582926432292, "learning_rate": 0.0001, "loss": 4.0048, "loss/crossentropy": 2.150280773639679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016005516052246, "step": 26080 }, { "epoch": 0.52164, "grad_norm": 2.125, "grad_norm_var": 0.009211985270182292, "learning_rate": 0.0001, "loss": 4.0257, "loss/crossentropy": 1.9364630579948425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17605994641780853, "step": 26082 }, { "epoch": 0.52168, "grad_norm": 1.890625, "grad_norm_var": 0.009226226806640625, "learning_rate": 0.0001, "loss": 4.1845, "loss/crossentropy": 2.195487380027771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19364381581544876, "step": 26084 }, { "epoch": 0.52172, "grad_norm": 1.8359375, "grad_norm_var": 0.0098785400390625, "learning_rate": 0.0001, "loss": 3.7514, "loss/crossentropy": 1.6146498918533325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726212576031685, "step": 26086 }, { "epoch": 0.52176, "grad_norm": 1.9140625, "grad_norm_var": 0.009968821207682292, "learning_rate": 0.0001, "loss": 3.8782, "loss/crossentropy": 2.2554049491882324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20600321888923645, "step": 26088 }, { "epoch": 0.5218, "grad_norm": 1.9296875, "grad_norm_var": 0.010210927327473958, "learning_rate": 0.0001, "loss": 3.8424, "loss/crossentropy": 2.0821692943573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20989617705345154, "step": 26090 }, { "epoch": 0.52184, "grad_norm": 2.015625, "grad_norm_var": 0.009765625, "learning_rate": 0.0001, "loss": 4.1895, "loss/crossentropy": 2.114021897315979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20276522636413574, "step": 26092 }, { "epoch": 0.52188, "grad_norm": 1.8671875, "grad_norm_var": 0.009897613525390625, "learning_rate": 0.0001, "loss": 3.9856, "loss/crossentropy": 1.7443894147872925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15722404420375824, "step": 26094 }, { "epoch": 0.52192, "grad_norm": 1.8203125, "grad_norm_var": 0.010796864827473959, "learning_rate": 0.0001, "loss": 3.5758, "loss/crossentropy": 1.603552222251892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14188075065612793, "step": 26096 }, { "epoch": 0.52196, "grad_norm": 1.9453125, "grad_norm_var": 0.0070269266764322914, "learning_rate": 0.0001, "loss": 3.9047, "loss/crossentropy": 1.7003676295280457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1696496084332466, "step": 26098 }, { "epoch": 0.522, "grad_norm": 1.8671875, "grad_norm_var": 0.0069964090983072914, "learning_rate": 0.0001, "loss": 3.9792, "loss/crossentropy": 2.1242096424102783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1936848685145378, "step": 26100 }, { "epoch": 0.52204, "grad_norm": 1.9765625, "grad_norm_var": 0.0076416015625, "learning_rate": 0.0001, "loss": 3.9315, "loss/crossentropy": 2.012674570083618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068016678094864, "step": 26102 }, { "epoch": 0.52208, "grad_norm": 2.203125, "grad_norm_var": 0.014304351806640626, "learning_rate": 0.0001, "loss": 4.0618, "loss/crossentropy": 2.0136077404022217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968667060136795, "step": 26104 }, { "epoch": 0.52212, "grad_norm": 1.8203125, "grad_norm_var": 0.0141021728515625, "learning_rate": 0.0001, "loss": 3.9797, "loss/crossentropy": 1.8880528211593628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18080192804336548, "step": 26106 }, { "epoch": 0.52216, "grad_norm": 2.015625, "grad_norm_var": 0.014216105143229166, "learning_rate": 0.0001, "loss": 4.0981, "loss/crossentropy": 2.109636068344116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19920819997787476, "step": 26108 }, { "epoch": 0.5222, "grad_norm": 1.8984375, "grad_norm_var": 0.015236155192057291, "learning_rate": 0.0001, "loss": 4.0566, "loss/crossentropy": 1.8923597931861877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18053779006004333, "step": 26110 }, { "epoch": 0.52224, "grad_norm": 1.8828125, "grad_norm_var": 0.011324055989583333, "learning_rate": 0.0001, "loss": 4.0756, "loss/crossentropy": 2.179795265197754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18849624693393707, "step": 26112 }, { "epoch": 0.52228, "grad_norm": 1.9765625, "grad_norm_var": 0.011789703369140625, "learning_rate": 0.0001, "loss": 3.9964, "loss/crossentropy": 2.0764212608337402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18905582278966904, "step": 26114 }, { "epoch": 0.52232, "grad_norm": 2.0625, "grad_norm_var": 0.012634023030598959, "learning_rate": 0.0001, "loss": 4.0006, "loss/crossentropy": 2.2309343814849854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156553491950035, "step": 26116 }, { "epoch": 0.52236, "grad_norm": 1.78125, "grad_norm_var": 0.012261708577473959, "learning_rate": 0.0001, "loss": 3.8689, "loss/crossentropy": 1.8764967918395996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17533813416957855, "step": 26118 }, { "epoch": 0.5224, "grad_norm": 1.9140625, "grad_norm_var": 0.0075266520182291664, "learning_rate": 0.0001, "loss": 3.916, "loss/crossentropy": 2.1870853900909424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984037682414055, "step": 26120 }, { "epoch": 0.52244, "grad_norm": 1.828125, "grad_norm_var": 0.007838694254557292, "learning_rate": 0.0001, "loss": 3.716, "loss/crossentropy": 2.113897442817688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18819265812635422, "step": 26122 }, { "epoch": 0.52248, "grad_norm": 2.0625, "grad_norm_var": 0.008056386311848959, "learning_rate": 0.0001, "loss": 4.1003, "loss/crossentropy": 1.9980355501174927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20408682525157928, "step": 26124 }, { "epoch": 0.52252, "grad_norm": 2.015625, "grad_norm_var": 0.007784016927083333, "learning_rate": 0.0001, "loss": 4.0452, "loss/crossentropy": 2.0322210788726807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19906745105981827, "step": 26126 }, { "epoch": 0.52256, "grad_norm": 1.8671875, "grad_norm_var": 0.008304595947265625, "learning_rate": 0.0001, "loss": 3.9786, "loss/crossentropy": 2.0122641921043396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2028798833489418, "step": 26128 }, { "epoch": 0.5226, "grad_norm": 1.9296875, "grad_norm_var": 0.007352447509765625, "learning_rate": 0.0001, "loss": 4.0595, "loss/crossentropy": 2.0169489979743958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20758477598428726, "step": 26130 }, { "epoch": 0.52264, "grad_norm": 2.03125, "grad_norm_var": 0.006754302978515625, "learning_rate": 0.0001, "loss": 4.2196, "loss/crossentropy": 2.45787513256073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2269362136721611, "step": 26132 }, { "epoch": 0.52268, "grad_norm": 1.8203125, "grad_norm_var": 0.0064198811848958336, "learning_rate": 0.0001, "loss": 3.9921, "loss/crossentropy": 2.0325412154197693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20839164406061172, "step": 26134 }, { "epoch": 0.52272, "grad_norm": 2.046875, "grad_norm_var": 0.006815338134765625, "learning_rate": 0.0001, "loss": 4.1983, "loss/crossentropy": 2.1984806060791016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21555981040000916, "step": 26136 }, { "epoch": 0.52276, "grad_norm": 2.03125, "grad_norm_var": 0.005440266927083334, "learning_rate": 0.0001, "loss": 3.9955, "loss/crossentropy": 1.9628766775131226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2089555338025093, "step": 26138 }, { "epoch": 0.5228, "grad_norm": 1.8671875, "grad_norm_var": 0.0069882710774739586, "learning_rate": 0.0001, "loss": 3.9137, "loss/crossentropy": 2.1061203479766846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18837912380695343, "step": 26140 }, { "epoch": 0.52284, "grad_norm": 1.8984375, "grad_norm_var": 0.014485677083333334, "learning_rate": 0.0001, "loss": 3.6926, "loss/crossentropy": 2.120006561279297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19504809379577637, "step": 26142 }, { "epoch": 0.52288, "grad_norm": 1.859375, "grad_norm_var": 0.013749186197916667, "learning_rate": 0.0001, "loss": 3.8771, "loss/crossentropy": 2.0000118613243103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19644330441951752, "step": 26144 }, { "epoch": 0.52292, "grad_norm": 1.9296875, "grad_norm_var": 0.014411417643229167, "learning_rate": 0.0001, "loss": 4.2381, "loss/crossentropy": 2.261639356613159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21112587302923203, "step": 26146 }, { "epoch": 0.52296, "grad_norm": 1.9453125, "grad_norm_var": 0.013392893473307292, "learning_rate": 0.0001, "loss": 4.0808, "loss/crossentropy": 2.203276216983795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19632557779550552, "step": 26148 }, { "epoch": 0.523, "grad_norm": 1.953125, "grad_norm_var": 0.0129302978515625, "learning_rate": 0.0001, "loss": 4.1363, "loss/crossentropy": 2.1430857181549072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18741771578788757, "step": 26150 }, { "epoch": 0.52304, "grad_norm": 1.828125, "grad_norm_var": 0.012581380208333333, "learning_rate": 0.0001, "loss": 3.9574, "loss/crossentropy": 2.094003438949585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19085614383220673, "step": 26152 }, { "epoch": 0.52308, "grad_norm": 1.9140625, "grad_norm_var": 0.011800130208333334, "learning_rate": 0.0001, "loss": 3.9145, "loss/crossentropy": 1.9885526299476624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17914890497922897, "step": 26154 }, { "epoch": 0.52312, "grad_norm": 1.7734375, "grad_norm_var": 0.012263743082682292, "learning_rate": 0.0001, "loss": 3.9942, "loss/crossentropy": 1.885224461555481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1779332235455513, "step": 26156 }, { "epoch": 0.52316, "grad_norm": 1.8671875, "grad_norm_var": 0.007411448160807291, "learning_rate": 0.0001, "loss": 3.9081, "loss/crossentropy": 1.8236491084098816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17156238853931427, "step": 26158 }, { "epoch": 0.5232, "grad_norm": 1.8671875, "grad_norm_var": 0.008685048421223958, "learning_rate": 0.0001, "loss": 3.8614, "loss/crossentropy": 1.7451134324073792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16748512536287308, "step": 26160 }, { "epoch": 0.52324, "grad_norm": 2.046875, "grad_norm_var": 0.00872802734375, "learning_rate": 0.0001, "loss": 4.0804, "loss/crossentropy": 2.408850908279419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20247718691825867, "step": 26162 }, { "epoch": 0.52328, "grad_norm": 1.9453125, "grad_norm_var": 0.0099273681640625, "learning_rate": 0.0001, "loss": 4.0908, "loss/crossentropy": 2.265584349632263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20462538301944733, "step": 26164 }, { "epoch": 0.52332, "grad_norm": 1.8046875, "grad_norm_var": 0.008676910400390625, "learning_rate": 0.0001, "loss": 3.7925, "loss/crossentropy": 1.8766103982925415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1740010604262352, "step": 26166 }, { "epoch": 0.52336, "grad_norm": 2.046875, "grad_norm_var": 0.009413401285807291, "learning_rate": 0.0001, "loss": 3.9389, "loss/crossentropy": 1.858881413936615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.176437109708786, "step": 26168 }, { "epoch": 0.5234, "grad_norm": 1.96875, "grad_norm_var": 0.008907063802083334, "learning_rate": 0.0001, "loss": 3.9833, "loss/crossentropy": 1.733948528766632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.183179572224617, "step": 26170 }, { "epoch": 0.52344, "grad_norm": 2.34375, "grad_norm_var": 0.017923990885416668, "learning_rate": 0.0001, "loss": 4.4352, "loss/crossentropy": 1.7623996138572693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3073955178260803, "step": 26172 }, { "epoch": 0.52348, "grad_norm": 1.921875, "grad_norm_var": 0.017470041910807293, "learning_rate": 0.0001, "loss": 4.0571, "loss/crossentropy": 1.9229013919830322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19095296412706375, "step": 26174 }, { "epoch": 0.52352, "grad_norm": 1.84375, "grad_norm_var": 0.014867146809895834, "learning_rate": 0.0001, "loss": 4.1139, "loss/crossentropy": 2.155863106250763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1953117996454239, "step": 26176 }, { "epoch": 0.52356, "grad_norm": 1.8671875, "grad_norm_var": 0.01754735310872396, "learning_rate": 0.0001, "loss": 3.8574, "loss/crossentropy": 1.8662053942680359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17596308141946793, "step": 26178 }, { "epoch": 0.5236, "grad_norm": 2.4375, "grad_norm_var": 0.03288548787434896, "learning_rate": 0.0001, "loss": 4.2924, "loss/crossentropy": 2.0425168871879578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088511660695076, "step": 26180 }, { "epoch": 0.52364, "grad_norm": 1.875, "grad_norm_var": 0.031689453125, "learning_rate": 0.0001, "loss": 4.0537, "loss/crossentropy": 2.3064836263656616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20652327686548233, "step": 26182 }, { "epoch": 0.52368, "grad_norm": 2.171875, "grad_norm_var": 0.035065714518229166, "learning_rate": 0.0001, "loss": 4.2602, "loss/crossentropy": 2.437976837158203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21545469760894775, "step": 26184 }, { "epoch": 0.52372, "grad_norm": 2.03125, "grad_norm_var": 0.034645334879557295, "learning_rate": 0.0001, "loss": 3.8111, "loss/crossentropy": 1.8655226826667786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19221200793981552, "step": 26186 }, { "epoch": 0.52376, "grad_norm": 1.8203125, "grad_norm_var": 0.027339426676432292, "learning_rate": 0.0001, "loss": 3.9068, "loss/crossentropy": 1.5981062650680542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15811648964881897, "step": 26188 }, { "epoch": 0.5238, "grad_norm": 1.890625, "grad_norm_var": 0.02739232381184896, "learning_rate": 0.0001, "loss": 3.6361, "loss/crossentropy": 2.0123122334480286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117129176855087, "step": 26190 }, { "epoch": 0.52384, "grad_norm": 1.921875, "grad_norm_var": 0.02658869425455729, "learning_rate": 0.0001, "loss": 4.2715, "loss/crossentropy": 1.8640416860580444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17403434216976166, "step": 26192 }, { "epoch": 0.52388, "grad_norm": 1.9375, "grad_norm_var": 0.026683553059895834, "learning_rate": 0.0001, "loss": 3.9331, "loss/crossentropy": 1.9721081256866455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18515276163816452, "step": 26194 }, { "epoch": 0.52392, "grad_norm": 1.8984375, "grad_norm_var": 0.0124176025390625, "learning_rate": 0.0001, "loss": 3.9945, "loss/crossentropy": 1.9967116713523865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1789851263165474, "step": 26196 }, { "epoch": 0.52396, "grad_norm": 2.015625, "grad_norm_var": 0.012467193603515624, "learning_rate": 0.0001, "loss": 4.0541, "loss/crossentropy": 1.9825578331947327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994011253118515, "step": 26198 }, { "epoch": 0.524, "grad_norm": 1.7734375, "grad_norm_var": 0.011922200520833334, "learning_rate": 0.0001, "loss": 3.8301, "loss/crossentropy": 1.9351627230644226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18274720758199692, "step": 26200 }, { "epoch": 0.52404, "grad_norm": 1.8515625, "grad_norm_var": 0.011970011393229167, "learning_rate": 0.0001, "loss": 3.7656, "loss/crossentropy": 1.9799736738204956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18502405285835266, "step": 26202 }, { "epoch": 0.52408, "grad_norm": 1.828125, "grad_norm_var": 0.011896769205729166, "learning_rate": 0.0001, "loss": 3.8567, "loss/crossentropy": 1.6783319115638733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14768389612436295, "step": 26204 }, { "epoch": 0.52412, "grad_norm": 2.03125, "grad_norm_var": 0.022709147135416666, "learning_rate": 0.0001, "loss": 3.8827, "loss/crossentropy": 2.167922616004944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2303374707698822, "step": 26206 }, { "epoch": 0.52416, "grad_norm": 1.9453125, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 3.9447, "loss/crossentropy": 2.075382351875305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19902942329645157, "step": 26208 }, { "epoch": 0.5242, "grad_norm": 1.6796875, "grad_norm_var": 0.021203358968098957, "learning_rate": 0.0001, "loss": 3.7008, "loss/crossentropy": 1.9230342507362366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18066919595003128, "step": 26210 }, { "epoch": 0.52424, "grad_norm": 1.8671875, "grad_norm_var": 0.021134440104166666, "learning_rate": 0.0001, "loss": 3.7475, "loss/crossentropy": 2.000435531139374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19018910825252533, "step": 26212 }, { "epoch": 0.52428, "grad_norm": 2.1875, "grad_norm_var": 0.025569407145182292, "learning_rate": 0.0001, "loss": 4.1926, "loss/crossentropy": 1.887748658657074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17914170771837234, "step": 26214 }, { "epoch": 0.52432, "grad_norm": 1.9296875, "grad_norm_var": 0.023911285400390624, "learning_rate": 0.0001, "loss": 3.8331, "loss/crossentropy": 1.9090191721916199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18189192563295364, "step": 26216 }, { "epoch": 0.52436, "grad_norm": 1.8671875, "grad_norm_var": 0.023331705729166666, "learning_rate": 0.0001, "loss": 3.9697, "loss/crossentropy": 1.9368041157722473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18426424264907837, "step": 26218 }, { "epoch": 0.5244, "grad_norm": 1.875, "grad_norm_var": 0.022761027018229168, "learning_rate": 0.0001, "loss": 3.7137, "loss/crossentropy": 2.1554033160209656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20169231295585632, "step": 26220 }, { "epoch": 0.52444, "grad_norm": 1.84375, "grad_norm_var": 0.011848704020182291, "learning_rate": 0.0001, "loss": 3.78, "loss/crossentropy": 2.013387084007263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1921246349811554, "step": 26222 }, { "epoch": 0.52448, "grad_norm": 1.9453125, "grad_norm_var": 0.011848704020182291, "learning_rate": 0.0001, "loss": 4.045, "loss/crossentropy": 2.166366934776306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19481656700372696, "step": 26224 }, { "epoch": 0.52452, "grad_norm": 2.171875, "grad_norm_var": 0.012669881184895834, "learning_rate": 0.0001, "loss": 3.9517, "loss/crossentropy": 2.15239155292511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20820968598127365, "step": 26226 }, { "epoch": 0.52456, "grad_norm": 1.8828125, "grad_norm_var": 0.014229329427083333, "learning_rate": 0.0001, "loss": 3.9075, "loss/crossentropy": 2.3724613189697266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21621037274599075, "step": 26228 }, { "epoch": 0.5246, "grad_norm": 1.890625, "grad_norm_var": 0.008888498942057291, "learning_rate": 0.0001, "loss": 3.9089, "loss/crossentropy": 2.34514319896698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21700077503919601, "step": 26230 }, { "epoch": 0.52464, "grad_norm": 1.7890625, "grad_norm_var": 0.009085845947265626, "learning_rate": 0.0001, "loss": 3.8283, "loss/crossentropy": 1.7633844017982483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17453230917453766, "step": 26232 }, { "epoch": 0.52468, "grad_norm": 1.984375, "grad_norm_var": 0.010135650634765625, "learning_rate": 0.0001, "loss": 3.6374, "loss/crossentropy": 1.952212929725647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18606504052877426, "step": 26234 }, { "epoch": 0.52472, "grad_norm": 2.109375, "grad_norm_var": 0.012851715087890625, "learning_rate": 0.0001, "loss": 4.4356, "loss/crossentropy": 2.2077181935310364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19942647963762283, "step": 26236 }, { "epoch": 0.52476, "grad_norm": 2.046875, "grad_norm_var": 0.014029947916666667, "learning_rate": 0.0001, "loss": 4.2066, "loss/crossentropy": 2.110091209411621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2050892785191536, "step": 26238 }, { "epoch": 0.5248, "grad_norm": 1.9921875, "grad_norm_var": 0.014284006754557292, "learning_rate": 0.0001, "loss": 3.9062, "loss/crossentropy": 2.2912864685058594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043801248073578, "step": 26240 }, { "epoch": 0.52484, "grad_norm": 1.84375, "grad_norm_var": 0.010396067301432292, "learning_rate": 0.0001, "loss": 3.9453, "loss/crossentropy": 2.280913293361664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21790295839309692, "step": 26242 }, { "epoch": 0.52488, "grad_norm": 1.875, "grad_norm_var": 0.008568318684895833, "learning_rate": 0.0001, "loss": 3.9073, "loss/crossentropy": 1.9681018590927124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18275773525238037, "step": 26244 }, { "epoch": 0.52492, "grad_norm": 1.9453125, "grad_norm_var": 0.009073893229166666, "learning_rate": 0.0001, "loss": 4.0666, "loss/crossentropy": 2.1007355451583862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20663000643253326, "step": 26246 }, { "epoch": 0.52496, "grad_norm": 1.796875, "grad_norm_var": 0.009029134114583334, "learning_rate": 0.0001, "loss": 4.0375, "loss/crossentropy": 2.1250157952308655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20055226981639862, "step": 26248 }, { "epoch": 0.525, "grad_norm": 1.8359375, "grad_norm_var": 0.008451334635416667, "learning_rate": 0.0001, "loss": 3.8399, "loss/crossentropy": 1.8575809597969055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18729300796985626, "step": 26250 }, { "epoch": 0.52504, "grad_norm": 1.8984375, "grad_norm_var": 0.006150054931640625, "learning_rate": 0.0001, "loss": 4.0213, "loss/crossentropy": 2.172927498817444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20007719844579697, "step": 26252 }, { "epoch": 0.52508, "grad_norm": 1.765625, "grad_norm_var": 0.005271148681640625, "learning_rate": 0.0001, "loss": 3.8717, "loss/crossentropy": 2.200989246368408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2174500823020935, "step": 26254 }, { "epoch": 0.52512, "grad_norm": 1.890625, "grad_norm_var": 0.004835764567057292, "learning_rate": 0.0001, "loss": 4.051, "loss/crossentropy": 2.1007824540138245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20543181151151657, "step": 26256 }, { "epoch": 0.52516, "grad_norm": 1.921875, "grad_norm_var": 0.3579241434733073, "learning_rate": 0.0001, "loss": 3.9813, "loss/crossentropy": 1.8371055722236633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18277855217456818, "step": 26258 }, { "epoch": 0.5252, "grad_norm": 1.9765625, "grad_norm_var": 0.3564849853515625, "learning_rate": 0.0001, "loss": 3.8955, "loss/crossentropy": 1.9994609355926514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19775213301181793, "step": 26260 }, { "epoch": 0.52524, "grad_norm": 2.015625, "grad_norm_var": 0.3601519266764323, "learning_rate": 0.0001, "loss": 3.8842, "loss/crossentropy": 1.9619916677474976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894378885626793, "step": 26262 }, { "epoch": 0.52528, "grad_norm": 1.9375, "grad_norm_var": 0.3581438700358073, "learning_rate": 0.0001, "loss": 4.0634, "loss/crossentropy": 1.7938214540481567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18398133665323257, "step": 26264 }, { "epoch": 0.52532, "grad_norm": 2.109375, "grad_norm_var": 0.3526995340983073, "learning_rate": 0.0001, "loss": 4.1002, "loss/crossentropy": 2.0497565865516663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19529610127210617, "step": 26266 }, { "epoch": 0.52536, "grad_norm": 1.8046875, "grad_norm_var": 0.3564715067545573, "learning_rate": 0.0001, "loss": 3.8681, "loss/crossentropy": 2.2085641622543335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090958058834076, "step": 26268 }, { "epoch": 0.5254, "grad_norm": 1.953125, "grad_norm_var": 0.3512034098307292, "learning_rate": 0.0001, "loss": 4.1566, "loss/crossentropy": 2.143643379211426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21305573731660843, "step": 26270 }, { "epoch": 0.52544, "grad_norm": 2.34375, "grad_norm_var": 0.35494969685872396, "learning_rate": 0.0001, "loss": 4.1428, "loss/crossentropy": 2.2753764390945435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21875588595867157, "step": 26272 }, { "epoch": 0.52548, "grad_norm": 2.0, "grad_norm_var": 0.09860610961914062, "learning_rate": 0.0001, "loss": 4.1114, "loss/crossentropy": 2.0325489044189453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19430440664291382, "step": 26274 }, { "epoch": 0.52552, "grad_norm": 1.984375, "grad_norm_var": 0.0970293680826823, "learning_rate": 0.0001, "loss": 3.9278, "loss/crossentropy": 2.3086185455322266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21185901761054993, "step": 26276 }, { "epoch": 0.52556, "grad_norm": 2.03125, "grad_norm_var": 0.0931060791015625, "learning_rate": 0.0001, "loss": 4.1219, "loss/crossentropy": 2.1545599699020386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019871026277542, "step": 26278 }, { "epoch": 0.5256, "grad_norm": 2.03125, "grad_norm_var": 0.08904622395833334, "learning_rate": 0.0001, "loss": 4.0154, "loss/crossentropy": 1.8528248071670532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17894673347473145, "step": 26280 }, { "epoch": 0.52564, "grad_norm": 1.796875, "grad_norm_var": 0.09781901041666667, "learning_rate": 0.0001, "loss": 3.6732, "loss/crossentropy": 1.7371985912322998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17898088693618774, "step": 26282 }, { "epoch": 0.52568, "grad_norm": 1.78125, "grad_norm_var": 0.10242284138997396, "learning_rate": 0.0001, "loss": 3.7005, "loss/crossentropy": 1.8484544157981873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16372547298669815, "step": 26284 }, { "epoch": 0.52572, "grad_norm": 1.859375, "grad_norm_var": 0.10460586547851562, "learning_rate": 0.0001, "loss": 3.9015, "loss/crossentropy": 2.2279897928237915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19722852110862732, "step": 26286 }, { "epoch": 0.52576, "grad_norm": 2.046875, "grad_norm_var": 0.09019749959309896, "learning_rate": 0.0001, "loss": 4.0875, "loss/crossentropy": 1.9138891100883484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17364701628684998, "step": 26288 }, { "epoch": 0.5258, "grad_norm": 1.921875, "grad_norm_var": 0.010786946614583333, "learning_rate": 0.0001, "loss": 3.8941, "loss/crossentropy": 2.062381386756897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20237115025520325, "step": 26290 }, { "epoch": 0.52584, "grad_norm": 1.9765625, "grad_norm_var": 0.010548655192057292, "learning_rate": 0.0001, "loss": 4.0351, "loss/crossentropy": 2.094722032546997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1814640611410141, "step": 26292 }, { "epoch": 0.52588, "grad_norm": 2.015625, "grad_norm_var": 0.010334269205729166, "learning_rate": 0.0001, "loss": 4.2114, "loss/crossentropy": 1.6870167255401611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19745031744241714, "step": 26294 }, { "epoch": 0.52592, "grad_norm": 1.9375, "grad_norm_var": 0.007062784830729167, "learning_rate": 0.0001, "loss": 3.7982, "loss/crossentropy": 1.688876748085022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17806339263916016, "step": 26296 }, { "epoch": 0.52596, "grad_norm": 1.8125, "grad_norm_var": 0.006898752848307292, "learning_rate": 0.0001, "loss": 3.8256, "loss/crossentropy": 2.0577162504196167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18965084105730057, "step": 26298 }, { "epoch": 0.526, "grad_norm": 1.921875, "grad_norm_var": 0.005057779947916666, "learning_rate": 0.0001, "loss": 3.8858, "loss/crossentropy": 1.9294962882995605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19645613431930542, "step": 26300 }, { "epoch": 0.52604, "grad_norm": 2.0625, "grad_norm_var": 0.0063435872395833336, "learning_rate": 0.0001, "loss": 3.9, "loss/crossentropy": 2.1160236597061157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20182175934314728, "step": 26302 }, { "epoch": 0.52608, "grad_norm": 1.8984375, "grad_norm_var": 0.005232747395833333, "learning_rate": 0.0001, "loss": 3.9438, "loss/crossentropy": 2.0521216988563538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19732891023159027, "step": 26304 }, { "epoch": 0.52612, "grad_norm": 1.9140625, "grad_norm_var": 0.004789225260416667, "learning_rate": 0.0001, "loss": 4.1944, "loss/crossentropy": 2.3100136518478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20355679094791412, "step": 26306 }, { "epoch": 0.52616, "grad_norm": 2.109375, "grad_norm_var": 0.0065081278483072914, "learning_rate": 0.0001, "loss": 4.1077, "loss/crossentropy": 2.2635756731033325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21735866367816925, "step": 26308 }, { "epoch": 0.5262, "grad_norm": 1.828125, "grad_norm_var": 0.008512115478515625, "learning_rate": 0.0001, "loss": 4.0985, "loss/crossentropy": 1.8445220589637756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18094825744628906, "step": 26310 }, { "epoch": 0.52624, "grad_norm": 1.9296875, "grad_norm_var": 0.007651519775390625, "learning_rate": 0.0001, "loss": 3.9866, "loss/crossentropy": 2.174062728881836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19913697242736816, "step": 26312 }, { "epoch": 0.52628, "grad_norm": 1.953125, "grad_norm_var": 0.008565266927083334, "learning_rate": 0.0001, "loss": 3.8556, "loss/crossentropy": 2.1039488911628723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1815500482916832, "step": 26314 }, { "epoch": 0.52632, "grad_norm": 1.9609375, "grad_norm_var": 0.008491770426432291, "learning_rate": 0.0001, "loss": 4.3284, "loss/crossentropy": 2.134014844894409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855064690113068, "step": 26316 }, { "epoch": 0.52636, "grad_norm": 1.875, "grad_norm_var": 0.008050282796223959, "learning_rate": 0.0001, "loss": 3.9977, "loss/crossentropy": 1.7635084390640259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16970381140708923, "step": 26318 }, { "epoch": 0.5264, "grad_norm": 1.921875, "grad_norm_var": 0.0078122456868489586, "learning_rate": 0.0001, "loss": 4.0349, "loss/crossentropy": 2.4526535272598267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23066413402557373, "step": 26320 }, { "epoch": 0.52644, "grad_norm": 2.046875, "grad_norm_var": 0.008210245768229167, "learning_rate": 0.0001, "loss": 4.1903, "loss/crossentropy": 2.073263466358185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19519560784101486, "step": 26322 }, { "epoch": 0.52648, "grad_norm": 2.0625, "grad_norm_var": 0.007401529947916667, "learning_rate": 0.0001, "loss": 3.9606, "loss/crossentropy": 1.9899149537086487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18811341375112534, "step": 26324 }, { "epoch": 0.52652, "grad_norm": 2.203125, "grad_norm_var": 0.013899739583333333, "learning_rate": 0.0001, "loss": 3.9753, "loss/crossentropy": 2.039364755153656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18913507461547852, "step": 26326 }, { "epoch": 0.52656, "grad_norm": 1.9609375, "grad_norm_var": 0.014096832275390625, "learning_rate": 0.0001, "loss": 3.9944, "loss/crossentropy": 2.076085150241852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21231256425380707, "step": 26328 }, { "epoch": 0.5266, "grad_norm": 1.7734375, "grad_norm_var": 0.016331990559895832, "learning_rate": 0.0001, "loss": 3.8768, "loss/crossentropy": 2.1222329139709473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18735431134700775, "step": 26330 }, { "epoch": 0.52664, "grad_norm": 1.8984375, "grad_norm_var": 0.01644872029622396, "learning_rate": 0.0001, "loss": 4.206, "loss/crossentropy": 2.490665316581726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19682377576828003, "step": 26332 }, { "epoch": 0.52668, "grad_norm": 1.7890625, "grad_norm_var": 0.01778132120768229, "learning_rate": 0.0001, "loss": 4.0541, "loss/crossentropy": 2.2802486419677734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20574340224266052, "step": 26334 }, { "epoch": 0.52672, "grad_norm": 1.8359375, "grad_norm_var": 0.019925944010416665, "learning_rate": 0.0001, "loss": 3.9517, "loss/crossentropy": 2.094264805316925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18822843581438065, "step": 26336 }, { "epoch": 0.52676, "grad_norm": 2.0, "grad_norm_var": 0.020151519775390626, "learning_rate": 0.0001, "loss": 3.7813, "loss/crossentropy": 1.9270477294921875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1901463121175766, "step": 26338 }, { "epoch": 0.5268, "grad_norm": 1.8671875, "grad_norm_var": 0.018046061197916668, "learning_rate": 0.0001, "loss": 4.1199, "loss/crossentropy": 2.1571450233459473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919422224164009, "step": 26340 }, { "epoch": 0.52684, "grad_norm": 1.875, "grad_norm_var": 0.008902994791666667, "learning_rate": 0.0001, "loss": 3.9607, "loss/crossentropy": 1.9483349323272705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188851498067379, "step": 26342 }, { "epoch": 0.52688, "grad_norm": 1.984375, "grad_norm_var": 0.009401194254557292, "learning_rate": 0.0001, "loss": 3.9986, "loss/crossentropy": 2.0212563276290894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19506627321243286, "step": 26344 }, { "epoch": 0.52692, "grad_norm": 1.953125, "grad_norm_var": 0.004689280192057292, "learning_rate": 0.0001, "loss": 3.9598, "loss/crossentropy": 1.9661552906036377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18422409892082214, "step": 26346 }, { "epoch": 0.52696, "grad_norm": 1.953125, "grad_norm_var": 0.0053212483723958336, "learning_rate": 0.0001, "loss": 3.851, "loss/crossentropy": 2.0267988443374634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18864717334508896, "step": 26348 }, { "epoch": 0.527, "grad_norm": 1.7734375, "grad_norm_var": 0.006468709309895833, "learning_rate": 0.0001, "loss": 3.8491, "loss/crossentropy": 1.874852180480957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17451918125152588, "step": 26350 }, { "epoch": 0.52704, "grad_norm": 1.9453125, "grad_norm_var": 0.0056722005208333336, "learning_rate": 0.0001, "loss": 3.8164, "loss/crossentropy": 1.9450770020484924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19308660924434662, "step": 26352 }, { "epoch": 0.52708, "grad_norm": 1.9765625, "grad_norm_var": 0.004589589436848959, "learning_rate": 0.0001, "loss": 3.9731, "loss/crossentropy": 1.8964659571647644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18824025243520737, "step": 26354 }, { "epoch": 0.52712, "grad_norm": 1.921875, "grad_norm_var": 0.0045166015625, "learning_rate": 0.0001, "loss": 3.9693, "loss/crossentropy": 1.856788158416748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17411265522241592, "step": 26356 }, { "epoch": 0.52716, "grad_norm": 1.984375, "grad_norm_var": 0.005098470052083333, "learning_rate": 0.0001, "loss": 3.963, "loss/crossentropy": 1.6551550030708313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1649954915046692, "step": 26358 }, { "epoch": 0.5272, "grad_norm": 1.9375, "grad_norm_var": 0.005791982014973958, "learning_rate": 0.0001, "loss": 3.7678, "loss/crossentropy": 1.9796473383903503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1766190305352211, "step": 26360 }, { "epoch": 0.52724, "grad_norm": 1.875, "grad_norm_var": 0.0056955973307291664, "learning_rate": 0.0001, "loss": 3.9551, "loss/crossentropy": 1.9023584723472595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18641258776187897, "step": 26362 }, { "epoch": 0.52728, "grad_norm": 1.9453125, "grad_norm_var": 0.005069732666015625, "learning_rate": 0.0001, "loss": 4.1641, "loss/crossentropy": 2.0332838892936707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18603450059890747, "step": 26364 }, { "epoch": 0.52732, "grad_norm": 1.8515625, "grad_norm_var": 0.003580474853515625, "learning_rate": 0.0001, "loss": 3.8799, "loss/crossentropy": 2.0670453310012817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1904071420431137, "step": 26366 }, { "epoch": 0.52736, "grad_norm": 1.84375, "grad_norm_var": 0.004162343343098959, "learning_rate": 0.0001, "loss": 3.7713, "loss/crossentropy": 1.9684784412384033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19187042117118835, "step": 26368 }, { "epoch": 0.5274, "grad_norm": 1.859375, "grad_norm_var": 0.003979237874348959, "learning_rate": 0.0001, "loss": 4.0018, "loss/crossentropy": 1.9178830981254578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805613711476326, "step": 26370 }, { "epoch": 0.52744, "grad_norm": 2.046875, "grad_norm_var": 0.005496978759765625, "learning_rate": 0.0001, "loss": 4.1603, "loss/crossentropy": 2.094612956047058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2008626013994217, "step": 26372 }, { "epoch": 0.52748, "grad_norm": 1.9453125, "grad_norm_var": 0.005102284749348958, "learning_rate": 0.0001, "loss": 3.8223, "loss/crossentropy": 2.0617056488990784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19100335985422134, "step": 26374 }, { "epoch": 0.52752, "grad_norm": 1.8984375, "grad_norm_var": 0.00411376953125, "learning_rate": 0.0001, "loss": 3.8758, "loss/crossentropy": 2.1675769090652466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1890057846903801, "step": 26376 }, { "epoch": 0.52756, "grad_norm": 2.0625, "grad_norm_var": 0.005600738525390625, "learning_rate": 0.0001, "loss": 4.0096, "loss/crossentropy": 2.245623469352722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736746490001678, "step": 26378 }, { "epoch": 0.5276, "grad_norm": 1.9296875, "grad_norm_var": 0.0055735270182291664, "learning_rate": 0.0001, "loss": 4.1111, "loss/crossentropy": 2.107472836971283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996273696422577, "step": 26380 }, { "epoch": 0.52764, "grad_norm": 1.8125, "grad_norm_var": 0.006083170572916667, "learning_rate": 0.0001, "loss": 3.7678, "loss/crossentropy": 1.6452732682228088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16541734337806702, "step": 26382 }, { "epoch": 0.52768, "grad_norm": 2.1875, "grad_norm_var": 0.020611317952473958, "learning_rate": 0.0001, "loss": 4.1566, "loss/crossentropy": 2.0002782940864563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27171435952186584, "step": 26384 }, { "epoch": 0.52772, "grad_norm": 2.0, "grad_norm_var": 0.018888092041015624, "learning_rate": 0.0001, "loss": 4.2397, "loss/crossentropy": 2.0969172716140747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20492591708898544, "step": 26386 }, { "epoch": 0.52776, "grad_norm": 1.953125, "grad_norm_var": 0.019510904947916668, "learning_rate": 0.0001, "loss": 3.8078, "loss/crossentropy": 2.238045036792755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20098427683115005, "step": 26388 }, { "epoch": 0.5278, "grad_norm": 1.8671875, "grad_norm_var": 0.021022288004557292, "learning_rate": 0.0001, "loss": 4.0391, "loss/crossentropy": 1.9958226680755615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19130828231573105, "step": 26390 }, { "epoch": 0.52784, "grad_norm": 1.9609375, "grad_norm_var": 0.020702107747395834, "learning_rate": 0.0001, "loss": 4.1161, "loss/crossentropy": 2.176322102546692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20082970708608627, "step": 26392 }, { "epoch": 0.52788, "grad_norm": 1.953125, "grad_norm_var": 0.0212066650390625, "learning_rate": 0.0001, "loss": 4.0363, "loss/crossentropy": 1.9507973790168762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17588113993406296, "step": 26394 }, { "epoch": 0.52792, "grad_norm": 1.8203125, "grad_norm_var": 0.022013346354166668, "learning_rate": 0.0001, "loss": 3.9508, "loss/crossentropy": 2.0576651096343994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19712203741073608, "step": 26396 }, { "epoch": 0.52796, "grad_norm": 1.7890625, "grad_norm_var": 0.022663370768229166, "learning_rate": 0.0001, "loss": 3.9035, "loss/crossentropy": 2.030472457408905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18470944464206696, "step": 26398 }, { "epoch": 0.528, "grad_norm": 2.078125, "grad_norm_var": 0.011378733317057292, "learning_rate": 0.0001, "loss": 3.9675, "loss/crossentropy": 2.0267462730407715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19596907496452332, "step": 26400 }, { "epoch": 0.52804, "grad_norm": 1.984375, "grad_norm_var": 0.013927968343098958, "learning_rate": 0.0001, "loss": 3.7917, "loss/crossentropy": 2.087961494922638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19787786155939102, "step": 26402 }, { "epoch": 0.52808, "grad_norm": 1.8828125, "grad_norm_var": 0.0133941650390625, "learning_rate": 0.0001, "loss": 3.7384, "loss/crossentropy": 1.6107558608055115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16004415601491928, "step": 26404 }, { "epoch": 0.52812, "grad_norm": 2.015625, "grad_norm_var": 0.010249837239583334, "learning_rate": 0.0001, "loss": 4.0522, "loss/crossentropy": 1.8615078926086426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997840330004692, "step": 26406 }, { "epoch": 0.52816, "grad_norm": 1.9765625, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 4.0445, "loss/crossentropy": 2.165738344192505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002716213464737, "step": 26408 }, { "epoch": 0.5282, "grad_norm": 1.828125, "grad_norm_var": 0.009700520833333334, "learning_rate": 0.0001, "loss": 3.9418, "loss/crossentropy": 2.039876639842987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19271554052829742, "step": 26410 }, { "epoch": 0.52824, "grad_norm": 1.8671875, "grad_norm_var": 0.008763631184895834, "learning_rate": 0.0001, "loss": 3.9841, "loss/crossentropy": 2.114060878753662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19228192418813705, "step": 26412 }, { "epoch": 0.52828, "grad_norm": 1.8671875, "grad_norm_var": 0.006998697916666667, "learning_rate": 0.0001, "loss": 4.0182, "loss/crossentropy": 2.1510127782821655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19021940976381302, "step": 26414 }, { "epoch": 0.52832, "grad_norm": 1.8515625, "grad_norm_var": 0.004986317952473959, "learning_rate": 0.0001, "loss": 3.9933, "loss/crossentropy": 1.9514253735542297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19038037955760956, "step": 26416 }, { "epoch": 0.52836, "grad_norm": 1.828125, "grad_norm_var": 0.0032867431640625, "learning_rate": 0.0001, "loss": 3.9041, "loss/crossentropy": 1.8573151230812073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17533313482999802, "step": 26418 }, { "epoch": 0.5284, "grad_norm": 1.84375, "grad_norm_var": 0.004084269205729167, "learning_rate": 0.0001, "loss": 4.1181, "loss/crossentropy": 2.2349034547805786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1933293491601944, "step": 26420 }, { "epoch": 0.52844, "grad_norm": 1.953125, "grad_norm_var": 0.0031084696451822917, "learning_rate": 0.0001, "loss": 4.127, "loss/crossentropy": 1.9790935516357422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930369809269905, "step": 26422 }, { "epoch": 0.52848, "grad_norm": 1.890625, "grad_norm_var": 0.002399698893229167, "learning_rate": 0.0001, "loss": 3.9531, "loss/crossentropy": 1.9758057594299316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19405999034643173, "step": 26424 }, { "epoch": 0.52852, "grad_norm": 1.8828125, "grad_norm_var": 0.0030637105305989583, "learning_rate": 0.0001, "loss": 4.0303, "loss/crossentropy": 1.9666665196418762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19631757587194443, "step": 26426 }, { "epoch": 0.52856, "grad_norm": 1.859375, "grad_norm_var": 0.0037717183430989584, "learning_rate": 0.0001, "loss": 3.8257, "loss/crossentropy": 1.7752289175987244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16831018775701523, "step": 26428 }, { "epoch": 0.5286, "grad_norm": 2.015625, "grad_norm_var": 0.005293782552083333, "learning_rate": 0.0001, "loss": 4.0879, "loss/crossentropy": 2.1496593952178955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1875729113817215, "step": 26430 }, { "epoch": 0.52864, "grad_norm": 1.9296875, "grad_norm_var": 0.005926259358723958, "learning_rate": 0.0001, "loss": 4.0407, "loss/crossentropy": 1.9487584829330444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18598007410764694, "step": 26432 }, { "epoch": 0.52868, "grad_norm": 1.859375, "grad_norm_var": 0.009886678059895833, "learning_rate": 0.0001, "loss": 3.9064, "loss/crossentropy": 1.8842872977256775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1913917362689972, "step": 26434 }, { "epoch": 0.52872, "grad_norm": 1.8671875, "grad_norm_var": 0.009913889567057292, "learning_rate": 0.0001, "loss": 4.1857, "loss/crossentropy": 2.1598324179649353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19371075928211212, "step": 26436 }, { "epoch": 0.52876, "grad_norm": 1.9140625, "grad_norm_var": 0.010228474934895834, "learning_rate": 0.0001, "loss": 4.0243, "loss/crossentropy": 1.993024468421936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19384337216615677, "step": 26438 }, { "epoch": 0.5288, "grad_norm": 1.8515625, "grad_norm_var": 0.011631011962890625, "learning_rate": 0.0001, "loss": 3.9132, "loss/crossentropy": 2.083270490169525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1818510740995407, "step": 26440 }, { "epoch": 0.52884, "grad_norm": 2.015625, "grad_norm_var": 0.011761220296223958, "learning_rate": 0.0001, "loss": 4.0291, "loss/crossentropy": 2.136199116706848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19123348593711853, "step": 26442 }, { "epoch": 0.52888, "grad_norm": 2.046875, "grad_norm_var": 0.010335286458333334, "learning_rate": 0.0001, "loss": 4.2448, "loss/crossentropy": 2.405175805091858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21493228524923325, "step": 26444 }, { "epoch": 0.52892, "grad_norm": 1.84375, "grad_norm_var": 0.010900624593098958, "learning_rate": 0.0001, "loss": 3.9793, "loss/crossentropy": 2.0925610065460205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20184864103794098, "step": 26446 }, { "epoch": 0.52896, "grad_norm": 1.90625, "grad_norm_var": 0.012116495768229167, "learning_rate": 0.0001, "loss": 4.1213, "loss/crossentropy": 2.1476112604141235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.206947922706604, "step": 26448 }, { "epoch": 0.529, "grad_norm": 1.84375, "grad_norm_var": 0.009333292643229166, "learning_rate": 0.0001, "loss": 3.9592, "loss/crossentropy": 1.7512850165367126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15776671469211578, "step": 26450 }, { "epoch": 0.52904, "grad_norm": 1.8671875, "grad_norm_var": 0.009505208333333333, "learning_rate": 0.0001, "loss": 4.0763, "loss/crossentropy": 1.766721785068512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18328477442264557, "step": 26452 }, { "epoch": 0.52908, "grad_norm": 1.890625, "grad_norm_var": 0.009178670247395833, "learning_rate": 0.0001, "loss": 4.0028, "loss/crossentropy": 1.9713833928108215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19086521863937378, "step": 26454 }, { "epoch": 0.52912, "grad_norm": 1.765625, "grad_norm_var": 0.009212239583333334, "learning_rate": 0.0001, "loss": 3.7999, "loss/crossentropy": 1.8375617861747742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799859181046486, "step": 26456 }, { "epoch": 0.52916, "grad_norm": 1.9453125, "grad_norm_var": 0.009368642171223959, "learning_rate": 0.0001, "loss": 4.0928, "loss/crossentropy": 2.3689894676208496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21843338012695312, "step": 26458 }, { "epoch": 0.5292, "grad_norm": 1.859375, "grad_norm_var": 0.008896636962890624, "learning_rate": 0.0001, "loss": 3.8911, "loss/crossentropy": 2.2842042446136475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20215284824371338, "step": 26460 }, { "epoch": 0.52924, "grad_norm": 1.9296875, "grad_norm_var": 0.008405558268229167, "learning_rate": 0.0001, "loss": 3.7891, "loss/crossentropy": 2.0660494565963745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18394764512777328, "step": 26462 }, { "epoch": 0.52928, "grad_norm": 1.9453125, "grad_norm_var": 0.009537506103515624, "learning_rate": 0.0001, "loss": 4.097, "loss/crossentropy": 2.101405918598175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051784247159958, "step": 26464 }, { "epoch": 0.52932, "grad_norm": 1.7890625, "grad_norm_var": 0.00943603515625, "learning_rate": 0.0001, "loss": 3.8075, "loss/crossentropy": 1.8293753862380981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18773258477449417, "step": 26466 }, { "epoch": 0.52936, "grad_norm": 1.703125, "grad_norm_var": 0.011115519205729167, "learning_rate": 0.0001, "loss": 3.5724, "loss/crossentropy": 1.9455705881118774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17760945856571198, "step": 26468 }, { "epoch": 0.5294, "grad_norm": 1.8828125, "grad_norm_var": 0.011508941650390625, "learning_rate": 0.0001, "loss": 3.9305, "loss/crossentropy": 1.750687837600708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18026860058307648, "step": 26470 }, { "epoch": 0.52944, "grad_norm": 1.921875, "grad_norm_var": 0.010375722249348959, "learning_rate": 0.0001, "loss": 4.0931, "loss/crossentropy": 2.189963698387146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19402040541172028, "step": 26472 }, { "epoch": 0.52948, "grad_norm": 1.9453125, "grad_norm_var": 0.009549713134765625, "learning_rate": 0.0001, "loss": 4.2266, "loss/crossentropy": 1.8729415535926819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855195820331573, "step": 26474 }, { "epoch": 0.52952, "grad_norm": 1.8828125, "grad_norm_var": 0.0097412109375, "learning_rate": 0.0001, "loss": 3.5935, "loss/crossentropy": 1.8071807622909546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16968178004026413, "step": 26476 }, { "epoch": 0.52956, "grad_norm": 1.9375, "grad_norm_var": 0.010257720947265625, "learning_rate": 0.0001, "loss": 3.9832, "loss/crossentropy": 2.241515278816223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20128827542066574, "step": 26478 }, { "epoch": 0.5296, "grad_norm": 1.9453125, "grad_norm_var": 0.0062978108723958336, "learning_rate": 0.0001, "loss": 3.9792, "loss/crossentropy": 2.2276766300201416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.201935775578022, "step": 26480 }, { "epoch": 0.52964, "grad_norm": 2.015625, "grad_norm_var": 0.0061604817708333336, "learning_rate": 0.0001, "loss": 3.8511, "loss/crossentropy": 2.2751588821411133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21366826444864273, "step": 26482 }, { "epoch": 0.52968, "grad_norm": 1.8984375, "grad_norm_var": 0.00335693359375, "learning_rate": 0.0001, "loss": 4.1496, "loss/crossentropy": 2.2164629697799683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2294466495513916, "step": 26484 }, { "epoch": 0.52972, "grad_norm": 1.6328125, "grad_norm_var": 0.008131663004557291, "learning_rate": 0.0001, "loss": 3.7879, "loss/crossentropy": 1.8609917163848877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1908324807882309, "step": 26486 }, { "epoch": 0.52976, "grad_norm": 1.8203125, "grad_norm_var": 0.0085601806640625, "learning_rate": 0.0001, "loss": 4.066, "loss/crossentropy": 2.050028145313263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18777167797088623, "step": 26488 }, { "epoch": 0.5298, "grad_norm": 2.078125, "grad_norm_var": 0.011329905192057291, "learning_rate": 0.0001, "loss": 4.2666, "loss/crossentropy": 2.398528814315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22428229451179504, "step": 26490 }, { "epoch": 0.52984, "grad_norm": 2.078125, "grad_norm_var": 0.012889607747395834, "learning_rate": 0.0001, "loss": 3.8208, "loss/crossentropy": 2.0193370580673218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19870522618293762, "step": 26492 }, { "epoch": 0.52988, "grad_norm": 1.84375, "grad_norm_var": 0.012369791666666666, "learning_rate": 0.0001, "loss": 3.9186, "loss/crossentropy": 2.1982868313789368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19849159568548203, "step": 26494 }, { "epoch": 0.52992, "grad_norm": 1.8203125, "grad_norm_var": 0.013405100504557291, "learning_rate": 0.0001, "loss": 3.8017, "loss/crossentropy": 1.8915838599205017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16818702220916748, "step": 26496 }, { "epoch": 0.52996, "grad_norm": 1.8671875, "grad_norm_var": 0.012596638997395833, "learning_rate": 0.0001, "loss": 3.8179, "loss/crossentropy": 1.9891789555549622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18337423354387283, "step": 26498 }, { "epoch": 0.53, "grad_norm": 2.125, "grad_norm_var": 0.015571848551432291, "learning_rate": 0.0001, "loss": 4.1336, "loss/crossentropy": 2.4187783002853394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20852266252040863, "step": 26500 }, { "epoch": 0.53004, "grad_norm": 1.7734375, "grad_norm_var": 0.01514892578125, "learning_rate": 0.0001, "loss": 3.4934, "loss/crossentropy": 1.742977499961853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17425427585840225, "step": 26502 }, { "epoch": 0.53008, "grad_norm": 1.875, "grad_norm_var": 0.017000325520833335, "learning_rate": 0.0001, "loss": 4.0615, "loss/crossentropy": 1.8980122208595276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16612428426742554, "step": 26504 }, { "epoch": 0.53012, "grad_norm": 1.8828125, "grad_norm_var": 0.013993072509765624, "learning_rate": 0.0001, "loss": 3.9807, "loss/crossentropy": 1.7788900136947632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15797217935323715, "step": 26506 }, { "epoch": 0.53016, "grad_norm": 1.9296875, "grad_norm_var": 0.011578114827473958, "learning_rate": 0.0001, "loss": 4.0657, "loss/crossentropy": 1.8558028936386108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18111377209424973, "step": 26508 }, { "epoch": 0.5302, "grad_norm": 2.03125, "grad_norm_var": 0.012886555989583333, "learning_rate": 0.0001, "loss": 4.1489, "loss/crossentropy": 2.2158637046813965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20664983242750168, "step": 26510 }, { "epoch": 0.53024, "grad_norm": 1.8671875, "grad_norm_var": 0.012347157796223958, "learning_rate": 0.0001, "loss": 4.1, "loss/crossentropy": 2.159493863582611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18167608231306076, "step": 26512 }, { "epoch": 0.53028, "grad_norm": 1.921875, "grad_norm_var": 0.012717437744140626, "learning_rate": 0.0001, "loss": 3.9015, "loss/crossentropy": 2.328041195869446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777345836162567, "step": 26514 }, { "epoch": 0.53032, "grad_norm": 1.890625, "grad_norm_var": 0.0093902587890625, "learning_rate": 0.0001, "loss": 3.7341, "loss/crossentropy": 1.724794864654541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1699744239449501, "step": 26516 }, { "epoch": 0.53036, "grad_norm": 1.9765625, "grad_norm_var": 0.006198883056640625, "learning_rate": 0.0001, "loss": 4.0218, "loss/crossentropy": 1.96088445186615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1849687099456787, "step": 26518 }, { "epoch": 0.5304, "grad_norm": 1.875, "grad_norm_var": 0.0041656494140625, "learning_rate": 0.0001, "loss": 3.9326, "loss/crossentropy": 2.150395691394806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20122019946575165, "step": 26520 }, { "epoch": 0.53044, "grad_norm": 1.9140625, "grad_norm_var": 0.004296875, "learning_rate": 0.0001, "loss": 3.9209, "loss/crossentropy": 2.016223907470703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18350518494844437, "step": 26522 }, { "epoch": 0.53048, "grad_norm": 2.0, "grad_norm_var": 0.0060791015625, "learning_rate": 0.0001, "loss": 4.3091, "loss/crossentropy": 2.334647059440613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20854850858449936, "step": 26524 }, { "epoch": 0.53052, "grad_norm": 1.9453125, "grad_norm_var": 0.0053059895833333336, "learning_rate": 0.0001, "loss": 3.8457, "loss/crossentropy": 1.928149163722992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18799126893281937, "step": 26526 }, { "epoch": 0.53056, "grad_norm": 1.8828125, "grad_norm_var": 0.005224355061848958, "learning_rate": 0.0001, "loss": 3.8411, "loss/crossentropy": 2.297117590904236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20247841626405716, "step": 26528 }, { "epoch": 0.5306, "grad_norm": 2.15625, "grad_norm_var": 0.025925445556640624, "learning_rate": 0.0001, "loss": 3.8426, "loss/crossentropy": 1.6292110085487366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1556340456008911, "step": 26530 }, { "epoch": 0.53064, "grad_norm": 1.9765625, "grad_norm_var": 0.023819732666015624, "learning_rate": 0.0001, "loss": 3.9924, "loss/crossentropy": 2.3576741218566895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22244800627231598, "step": 26532 }, { "epoch": 0.53068, "grad_norm": 1.828125, "grad_norm_var": 0.024812825520833335, "learning_rate": 0.0001, "loss": 3.8956, "loss/crossentropy": 2.412485718727112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838645040988922, "step": 26534 }, { "epoch": 0.53072, "grad_norm": 1.84375, "grad_norm_var": 0.023276519775390626, "learning_rate": 0.0001, "loss": 3.9826, "loss/crossentropy": 2.1326744556427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19059032201766968, "step": 26536 }, { "epoch": 0.53076, "grad_norm": 1.8125, "grad_norm_var": 0.023726399739583334, "learning_rate": 0.0001, "loss": 3.8374, "loss/crossentropy": 1.8022708892822266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17919087409973145, "step": 26538 }, { "epoch": 0.5308, "grad_norm": 1.9375, "grad_norm_var": 0.023563385009765625, "learning_rate": 0.0001, "loss": 3.8724, "loss/crossentropy": 2.1001075506210327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887490525841713, "step": 26540 }, { "epoch": 0.53084, "grad_norm": 1.8359375, "grad_norm_var": 0.023860677083333334, "learning_rate": 0.0001, "loss": 3.8678, "loss/crossentropy": 1.9068372249603271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1752641201019287, "step": 26542 }, { "epoch": 0.53088, "grad_norm": 1.890625, "grad_norm_var": 0.023787180582682293, "learning_rate": 0.0001, "loss": 4.072, "loss/crossentropy": 1.8628905415534973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17757906019687653, "step": 26544 }, { "epoch": 0.53092, "grad_norm": 1.921875, "grad_norm_var": 0.0041196187337239586, "learning_rate": 0.0001, "loss": 4.1044, "loss/crossentropy": 1.9989616870880127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1890716478228569, "step": 26546 }, { "epoch": 0.53096, "grad_norm": 1.78125, "grad_norm_var": 0.004624176025390625, "learning_rate": 0.0001, "loss": 3.8013, "loss/crossentropy": 2.063919186592102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18740086257457733, "step": 26548 }, { "epoch": 0.531, "grad_norm": 1.953125, "grad_norm_var": 0.004355621337890625, "learning_rate": 0.0001, "loss": 4.0148, "loss/crossentropy": 1.9364339709281921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18698105216026306, "step": 26550 }, { "epoch": 0.53104, "grad_norm": 1.8671875, "grad_norm_var": 0.0037859598795572918, "learning_rate": 0.0001, "loss": 4.1554, "loss/crossentropy": 2.063125193119049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881653591990471, "step": 26552 }, { "epoch": 0.53108, "grad_norm": 1.8515625, "grad_norm_var": 0.003570302327473958, "learning_rate": 0.0001, "loss": 3.9373, "loss/crossentropy": 1.9567288160324097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19221889972686768, "step": 26554 }, { "epoch": 0.53112, "grad_norm": 1.8828125, "grad_norm_var": 0.004367828369140625, "learning_rate": 0.0001, "loss": 4.0455, "loss/crossentropy": 2.177999496459961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061704844236374, "step": 26556 }, { "epoch": 0.53116, "grad_norm": 1.7421875, "grad_norm_var": 0.006052398681640625, "learning_rate": 0.0001, "loss": 3.7498, "loss/crossentropy": 1.956209123134613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16483356803655624, "step": 26558 }, { "epoch": 0.5312, "grad_norm": 1.84375, "grad_norm_var": 0.006082916259765625, "learning_rate": 0.0001, "loss": 3.9295, "loss/crossentropy": 1.778043806552887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15840457379817963, "step": 26560 }, { "epoch": 0.53124, "grad_norm": 1.9921875, "grad_norm_var": 0.0064849853515625, "learning_rate": 0.0001, "loss": 4.2495, "loss/crossentropy": 2.5685311555862427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21522855013608932, "step": 26562 }, { "epoch": 0.53128, "grad_norm": 1.875, "grad_norm_var": 0.006259918212890625, "learning_rate": 0.0001, "loss": 4.0887, "loss/crossentropy": 2.054154694080353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18618051707744598, "step": 26564 }, { "epoch": 0.53132, "grad_norm": 2.046875, "grad_norm_var": 0.011114247639973958, "learning_rate": 0.0001, "loss": 4.1652, "loss/crossentropy": 2.0676876306533813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19200553745031357, "step": 26566 }, { "epoch": 0.53136, "grad_norm": 1.921875, "grad_norm_var": 0.010817209879557291, "learning_rate": 0.0001, "loss": 3.9219, "loss/crossentropy": 2.081417202949524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19634481519460678, "step": 26568 }, { "epoch": 0.5314, "grad_norm": 1.953125, "grad_norm_var": 0.010400136311848959, "learning_rate": 0.0001, "loss": 3.911, "loss/crossentropy": 2.0342991948127747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19060657918453217, "step": 26570 }, { "epoch": 0.53144, "grad_norm": 1.8203125, "grad_norm_var": 0.012132771809895833, "learning_rate": 0.0001, "loss": 3.8132, "loss/crossentropy": 2.2454493641853333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17674832791090012, "step": 26572 }, { "epoch": 0.53148, "grad_norm": 1.921875, "grad_norm_var": 0.010927073160807292, "learning_rate": 0.0001, "loss": 3.9746, "loss/crossentropy": 2.1805293560028076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19897499680519104, "step": 26574 }, { "epoch": 0.53152, "grad_norm": 1.9921875, "grad_norm_var": 0.0107086181640625, "learning_rate": 0.0001, "loss": 4.0728, "loss/crossentropy": 2.074836492538452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20241302251815796, "step": 26576 }, { "epoch": 0.53156, "grad_norm": 1.96875, "grad_norm_var": 0.010380045572916666, "learning_rate": 0.0001, "loss": 4.0942, "loss/crossentropy": 2.341915488243103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19814413785934448, "step": 26578 }, { "epoch": 0.5316, "grad_norm": 1.8046875, "grad_norm_var": 0.010994211832682291, "learning_rate": 0.0001, "loss": 3.8701, "loss/crossentropy": 1.690186619758606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16645215451717377, "step": 26580 }, { "epoch": 0.53164, "grad_norm": 1.9296875, "grad_norm_var": 0.005614217122395833, "learning_rate": 0.0001, "loss": 3.7241, "loss/crossentropy": 2.0770626068115234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19572289288043976, "step": 26582 }, { "epoch": 0.53168, "grad_norm": 1.859375, "grad_norm_var": 0.006351470947265625, "learning_rate": 0.0001, "loss": 4.0342, "loss/crossentropy": 2.411499500274658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21515025198459625, "step": 26584 }, { "epoch": 0.53172, "grad_norm": 1.84375, "grad_norm_var": 0.0062558492024739586, "learning_rate": 0.0001, "loss": 4.1189, "loss/crossentropy": 2.0363634824752808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18869787454605103, "step": 26586 }, { "epoch": 0.53176, "grad_norm": 1.8359375, "grad_norm_var": 0.004858144124348958, "learning_rate": 0.0001, "loss": 4.049, "loss/crossentropy": 2.203519582748413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20460474491119385, "step": 26588 }, { "epoch": 0.5318, "grad_norm": 2.0, "grad_norm_var": 0.004491170247395833, "learning_rate": 0.0001, "loss": 3.8633, "loss/crossentropy": 1.9883779883384705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21096121519804, "step": 26590 }, { "epoch": 0.53184, "grad_norm": 1.96875, "grad_norm_var": 0.005332183837890625, "learning_rate": 0.0001, "loss": 4.1148, "loss/crossentropy": 2.209134042263031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23714564740657806, "step": 26592 }, { "epoch": 0.53188, "grad_norm": 1.8671875, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 3.9585, "loss/crossentropy": 2.3128318786621094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20354144275188446, "step": 26594 }, { "epoch": 0.53192, "grad_norm": 2.09375, "grad_norm_var": 0.006624348958333333, "learning_rate": 0.0001, "loss": 4.2532, "loss/crossentropy": 2.2282800674438477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072424292564392, "step": 26596 }, { "epoch": 0.53196, "grad_norm": 1.90625, "grad_norm_var": 0.0056111653645833336, "learning_rate": 0.0001, "loss": 4.0443, "loss/crossentropy": 2.086778998374939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18629200011491776, "step": 26598 }, { "epoch": 0.532, "grad_norm": 1.8359375, "grad_norm_var": 0.005773671468098958, "learning_rate": 0.0001, "loss": 3.8206, "loss/crossentropy": 2.12203449010849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955840140581131, "step": 26600 }, { "epoch": 0.53204, "grad_norm": 1.8828125, "grad_norm_var": 0.007008616129557292, "learning_rate": 0.0001, "loss": 3.7792, "loss/crossentropy": 1.987316608428955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18615661561489105, "step": 26602 }, { "epoch": 0.53208, "grad_norm": 1.9375, "grad_norm_var": 0.007228342692057291, "learning_rate": 0.0001, "loss": 4.1107, "loss/crossentropy": 2.0519716143608093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968577429652214, "step": 26604 }, { "epoch": 0.53212, "grad_norm": 1.8515625, "grad_norm_var": 0.007020823160807292, "learning_rate": 0.0001, "loss": 4.1097, "loss/crossentropy": 2.2337719202041626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19842206686735153, "step": 26606 }, { "epoch": 0.53216, "grad_norm": 1.921875, "grad_norm_var": 0.0071489969889322914, "learning_rate": 0.0001, "loss": 3.7935, "loss/crossentropy": 2.2014262080192566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2037125900387764, "step": 26608 }, { "epoch": 0.5322, "grad_norm": 2.28125, "grad_norm_var": 0.0175689697265625, "learning_rate": 0.0001, "loss": 3.8532, "loss/crossentropy": 1.7964777946472168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17876359075307846, "step": 26610 }, { "epoch": 0.53224, "grad_norm": 1.8515625, "grad_norm_var": 0.0155914306640625, "learning_rate": 0.0001, "loss": 3.9521, "loss/crossentropy": 1.9710991978645325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18646740168333054, "step": 26612 }, { "epoch": 0.53228, "grad_norm": 1.7734375, "grad_norm_var": 0.016495513916015624, "learning_rate": 0.0001, "loss": 3.8435, "loss/crossentropy": 2.039173901081085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19768649339675903, "step": 26614 }, { "epoch": 0.53232, "grad_norm": 1.984375, "grad_norm_var": 0.0170318603515625, "learning_rate": 0.0001, "loss": 3.9751, "loss/crossentropy": 2.2037198543548584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104790136218071, "step": 26616 }, { "epoch": 0.53236, "grad_norm": 2.0, "grad_norm_var": 0.01741943359375, "learning_rate": 0.0001, "loss": 3.9441, "loss/crossentropy": 2.1893996596336365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21086306124925613, "step": 26618 }, { "epoch": 0.5324, "grad_norm": 1.84375, "grad_norm_var": 0.0174560546875, "learning_rate": 0.0001, "loss": 3.9598, "loss/crossentropy": 2.0324689745903015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20213106274604797, "step": 26620 }, { "epoch": 0.53244, "grad_norm": 1.984375, "grad_norm_var": 0.018244425455729168, "learning_rate": 0.0001, "loss": 3.8941, "loss/crossentropy": 2.191510498523712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20318301022052765, "step": 26622 }, { "epoch": 0.53248, "grad_norm": 2.046875, "grad_norm_var": 0.018741861979166666, "learning_rate": 0.0001, "loss": 4.131, "loss/crossentropy": 2.2092620134353638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033141329884529, "step": 26624 }, { "epoch": 0.53252, "grad_norm": 1.875, "grad_norm_var": 0.008796946207682291, "learning_rate": 0.0001, "loss": 3.8481, "loss/crossentropy": 2.078882932662964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1927858516573906, "step": 26626 }, { "epoch": 0.53256, "grad_norm": 2.109375, "grad_norm_var": 0.010599517822265625, "learning_rate": 0.0001, "loss": 4.3066, "loss/crossentropy": 2.1759738326072693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21904072910547256, "step": 26628 }, { "epoch": 0.5326, "grad_norm": 1.890625, "grad_norm_var": 0.01015625, "learning_rate": 0.0001, "loss": 3.7002, "loss/crossentropy": 2.0541751980781555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17661282420158386, "step": 26630 }, { "epoch": 0.53264, "grad_norm": 1.9375, "grad_norm_var": 0.010394032796223958, "learning_rate": 0.0001, "loss": 3.7789, "loss/crossentropy": 1.8126670122146606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16475193202495575, "step": 26632 }, { "epoch": 0.53268, "grad_norm": 1.984375, "grad_norm_var": 0.009065500895182292, "learning_rate": 0.0001, "loss": 3.9798, "loss/crossentropy": 1.8467394709587097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17031905055046082, "step": 26634 }, { "epoch": 0.53272, "grad_norm": 1.9609375, "grad_norm_var": 0.008516184488932292, "learning_rate": 0.0001, "loss": 3.8392, "loss/crossentropy": 1.7537733912467957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17307621240615845, "step": 26636 }, { "epoch": 0.53276, "grad_norm": 2.078125, "grad_norm_var": 0.009273274739583334, "learning_rate": 0.0001, "loss": 4.2491, "loss/crossentropy": 2.2367827892303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21301713585853577, "step": 26638 }, { "epoch": 0.5328, "grad_norm": 1.8671875, "grad_norm_var": 0.009419504801432292, "learning_rate": 0.0001, "loss": 3.9927, "loss/crossentropy": 1.9437520503997803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19594744592905045, "step": 26640 }, { "epoch": 0.53284, "grad_norm": 1.9765625, "grad_norm_var": 0.0102691650390625, "learning_rate": 0.0001, "loss": 4.1308, "loss/crossentropy": 1.9712047576904297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043192759156227, "step": 26642 }, { "epoch": 0.53288, "grad_norm": 1.8671875, "grad_norm_var": 0.0079498291015625, "learning_rate": 0.0001, "loss": 3.916, "loss/crossentropy": 1.951958179473877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054751068353653, "step": 26644 }, { "epoch": 0.53292, "grad_norm": 1.890625, "grad_norm_var": 0.0067291259765625, "learning_rate": 0.0001, "loss": 3.929, "loss/crossentropy": 2.0811157822608948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19759366661310196, "step": 26646 }, { "epoch": 0.53296, "grad_norm": 1.9609375, "grad_norm_var": 0.06748428344726562, "learning_rate": 0.0001, "loss": 4.0009, "loss/crossentropy": 2.0309654474258423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1910555213689804, "step": 26648 }, { "epoch": 0.533, "grad_norm": 1.78125, "grad_norm_var": 0.07044448852539062, "learning_rate": 0.0001, "loss": 3.7709, "loss/crossentropy": 2.003150999546051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19752421230077744, "step": 26650 }, { "epoch": 0.53304, "grad_norm": 1.9140625, "grad_norm_var": 0.0694091796875, "learning_rate": 0.0001, "loss": 3.9324, "loss/crossentropy": 1.788662612438202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17759086191654205, "step": 26652 }, { "epoch": 0.53308, "grad_norm": 1.859375, "grad_norm_var": 0.07455240885416667, "learning_rate": 0.0001, "loss": 3.835, "loss/crossentropy": 1.7692713737487793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16581754386425018, "step": 26654 }, { "epoch": 0.53312, "grad_norm": 1.859375, "grad_norm_var": 0.07297337849934896, "learning_rate": 0.0001, "loss": 3.798, "loss/crossentropy": 1.8395215272903442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18920384347438812, "step": 26656 }, { "epoch": 0.53316, "grad_norm": 1.7734375, "grad_norm_var": 0.07563247680664062, "learning_rate": 0.0001, "loss": 4.0133, "loss/crossentropy": 2.0375067591667175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19706164300441742, "step": 26658 }, { "epoch": 0.5332, "grad_norm": 2.0, "grad_norm_var": 0.076025390625, "learning_rate": 0.0001, "loss": 3.9964, "loss/crossentropy": 2.101609468460083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20092131942510605, "step": 26660 }, { "epoch": 0.53324, "grad_norm": 1.8984375, "grad_norm_var": 0.07556864420572916, "learning_rate": 0.0001, "loss": 3.8518, "loss/crossentropy": 1.9724875092506409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19354254007339478, "step": 26662 }, { "epoch": 0.53328, "grad_norm": 1.828125, "grad_norm_var": 0.015000152587890624, "learning_rate": 0.0001, "loss": 4.199, "loss/crossentropy": 1.9491792917251587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21028251200914383, "step": 26664 }, { "epoch": 0.53332, "grad_norm": 2.03125, "grad_norm_var": 0.012133534749348958, "learning_rate": 0.0001, "loss": 3.8784, "loss/crossentropy": 2.0189873576164246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20224104076623917, "step": 26666 }, { "epoch": 0.53336, "grad_norm": 1.9140625, "grad_norm_var": 0.01177978515625, "learning_rate": 0.0001, "loss": 3.6177, "loss/crossentropy": 2.0183663368225098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20128542184829712, "step": 26668 }, { "epoch": 0.5334, "grad_norm": 1.8125, "grad_norm_var": 0.01939697265625, "learning_rate": 0.0001, "loss": 4.2613, "loss/crossentropy": 2.323311448097229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202368974685669, "step": 26670 }, { "epoch": 0.53344, "grad_norm": 2.0, "grad_norm_var": 0.01764094034830729, "learning_rate": 0.0001, "loss": 4.2713, "loss/crossentropy": 2.14883291721344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21048131585121155, "step": 26672 }, { "epoch": 0.53348, "grad_norm": 1.96875, "grad_norm_var": 0.015313466389973959, "learning_rate": 0.0001, "loss": 4.2169, "loss/crossentropy": 2.0525609254837036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19780127704143524, "step": 26674 }, { "epoch": 0.53352, "grad_norm": 1.7734375, "grad_norm_var": 0.016900380452473957, "learning_rate": 0.0001, "loss": 4.0969, "loss/crossentropy": 2.125749707221985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18475912511348724, "step": 26676 }, { "epoch": 0.53356, "grad_norm": 1.9765625, "grad_norm_var": 0.016044108072916667, "learning_rate": 0.0001, "loss": 4.1139, "loss/crossentropy": 2.3169859647750854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21877627819776535, "step": 26678 }, { "epoch": 0.5336, "grad_norm": 1.984375, "grad_norm_var": 0.014525349934895833, "learning_rate": 0.0001, "loss": 3.9182, "loss/crossentropy": 2.263182818889618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20379970222711563, "step": 26680 }, { "epoch": 0.53364, "grad_norm": 1.8359375, "grad_norm_var": 0.014737701416015625, "learning_rate": 0.0001, "loss": 4.1722, "loss/crossentropy": 2.2744826078414917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20668719708919525, "step": 26682 }, { "epoch": 0.53368, "grad_norm": 1.8828125, "grad_norm_var": 0.014872233072916666, "learning_rate": 0.0001, "loss": 3.9593, "loss/crossentropy": 2.2624911665916443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010696530342102, "step": 26684 }, { "epoch": 0.53372, "grad_norm": 1.875, "grad_norm_var": 0.005456288655598958, "learning_rate": 0.0001, "loss": 3.7412, "loss/crossentropy": 2.2623918056488037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20084036886692047, "step": 26686 }, { "epoch": 0.53376, "grad_norm": 2.1875, "grad_norm_var": 0.013960774739583333, "learning_rate": 0.0001, "loss": 4.2368, "loss/crossentropy": 2.116790533065796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22413402050733566, "step": 26688 }, { "epoch": 0.5338, "grad_norm": 1.90625, "grad_norm_var": 0.013881174723307292, "learning_rate": 0.0001, "loss": 3.8506, "loss/crossentropy": 1.9280251860618591, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19084317982196808, "step": 26690 }, { "epoch": 0.53384, "grad_norm": 1.8046875, "grad_norm_var": 0.013490549723307292, "learning_rate": 0.0001, "loss": 3.727, "loss/crossentropy": 1.8203374743461609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16682762652635574, "step": 26692 }, { "epoch": 0.53388, "grad_norm": 1.9296875, "grad_norm_var": 0.013337961832682292, "learning_rate": 0.0001, "loss": 3.7433, "loss/crossentropy": 1.9362438321113586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947326734662056, "step": 26694 }, { "epoch": 0.53392, "grad_norm": 1.9375, "grad_norm_var": 0.015148671468098958, "learning_rate": 0.0001, "loss": 4.1108, "loss/crossentropy": 1.945202112197876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2301146537065506, "step": 26696 }, { "epoch": 0.53396, "grad_norm": 1.78125, "grad_norm_var": 0.016087849934895832, "learning_rate": 0.0001, "loss": 3.8197, "loss/crossentropy": 1.986995279788971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18129996210336685, "step": 26698 }, { "epoch": 0.534, "grad_norm": 1.9375, "grad_norm_var": 0.01623713175455729, "learning_rate": 0.0001, "loss": 3.9488, "loss/crossentropy": 2.157664656639099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106902152299881, "step": 26700 }, { "epoch": 0.53404, "grad_norm": 2.09375, "grad_norm_var": 0.014839680989583333, "learning_rate": 0.0001, "loss": 4.1374, "loss/crossentropy": 1.9703331589698792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21851468831300735, "step": 26702 }, { "epoch": 0.53408, "grad_norm": 1.8515625, "grad_norm_var": 0.008160146077473958, "learning_rate": 0.0001, "loss": 3.9243, "loss/crossentropy": 2.1339242458343506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987123116850853, "step": 26704 }, { "epoch": 0.53412, "grad_norm": 1.9140625, "grad_norm_var": 0.0085357666015625, "learning_rate": 0.0001, "loss": 4.0897, "loss/crossentropy": 1.9810134768486023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18423764407634735, "step": 26706 }, { "epoch": 0.53416, "grad_norm": 1.7890625, "grad_norm_var": 0.008695475260416667, "learning_rate": 0.0001, "loss": 4.0445, "loss/crossentropy": 1.8842402696609497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19414211809635162, "step": 26708 }, { "epoch": 0.5342, "grad_norm": 2.0625, "grad_norm_var": 0.0121490478515625, "learning_rate": 0.0001, "loss": 4.0123, "loss/crossentropy": 2.0668699741363525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19577328860759735, "step": 26710 }, { "epoch": 0.53424, "grad_norm": 1.9453125, "grad_norm_var": 0.009276326497395833, "learning_rate": 0.0001, "loss": 3.913, "loss/crossentropy": 2.2768534421920776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21711596846580505, "step": 26712 }, { "epoch": 0.53428, "grad_norm": 1.8671875, "grad_norm_var": 0.008872222900390626, "learning_rate": 0.0001, "loss": 4.0675, "loss/crossentropy": 2.1378949880599976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20244380831718445, "step": 26714 }, { "epoch": 0.53432, "grad_norm": 1.9140625, "grad_norm_var": 0.009798177083333333, "learning_rate": 0.0001, "loss": 4.0259, "loss/crossentropy": 2.4403127431869507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2065192013978958, "step": 26716 }, { "epoch": 0.53436, "grad_norm": 1.828125, "grad_norm_var": 0.008324178059895833, "learning_rate": 0.0001, "loss": 3.8731, "loss/crossentropy": 1.8811705708503723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881936937570572, "step": 26718 }, { "epoch": 0.5344, "grad_norm": 1.921875, "grad_norm_var": 0.010682932535807292, "learning_rate": 0.0001, "loss": 4.3268, "loss/crossentropy": 2.331605315208435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20130542665719986, "step": 26720 }, { "epoch": 0.53444, "grad_norm": 1.9765625, "grad_norm_var": 0.011763509114583333, "learning_rate": 0.0001, "loss": 3.6066, "loss/crossentropy": 1.521777868270874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1611887440085411, "step": 26722 }, { "epoch": 0.53448, "grad_norm": 2.0625, "grad_norm_var": 0.011905924479166666, "learning_rate": 0.0001, "loss": 4.0351, "loss/crossentropy": 2.0702260732650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21350041031837463, "step": 26724 }, { "epoch": 0.53452, "grad_norm": 2.03125, "grad_norm_var": 0.008760579427083333, "learning_rate": 0.0001, "loss": 4.2144, "loss/crossentropy": 2.2656288146972656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1988365724682808, "step": 26726 }, { "epoch": 0.53456, "grad_norm": 2.046875, "grad_norm_var": 0.008821614583333333, "learning_rate": 0.0001, "loss": 3.9917, "loss/crossentropy": 1.8514603972434998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19230767339468002, "step": 26728 }, { "epoch": 0.5346, "grad_norm": 1.96875, "grad_norm_var": 0.0082672119140625, "learning_rate": 0.0001, "loss": 3.6906, "loss/crossentropy": 1.682263970375061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19245991110801697, "step": 26730 }, { "epoch": 0.53464, "grad_norm": 1.84375, "grad_norm_var": 0.008778635660807292, "learning_rate": 0.0001, "loss": 3.9556, "loss/crossentropy": 2.0426923036575317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19547611474990845, "step": 26732 }, { "epoch": 0.53468, "grad_norm": 1.9296875, "grad_norm_var": 0.007826487223307291, "learning_rate": 0.0001, "loss": 3.8705, "loss/crossentropy": 2.0580697059631348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17934934794902802, "step": 26734 }, { "epoch": 0.53472, "grad_norm": 2.015625, "grad_norm_var": 0.0060791015625, "learning_rate": 0.0001, "loss": 4.0613, "loss/crossentropy": 2.2542420625686646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21669819951057434, "step": 26736 }, { "epoch": 0.53476, "grad_norm": 1.84375, "grad_norm_var": 0.004740142822265625, "learning_rate": 0.0001, "loss": 3.8354, "loss/crossentropy": 2.0720590949058533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18977323174476624, "step": 26738 }, { "epoch": 0.5348, "grad_norm": 1.875, "grad_norm_var": 0.007134755452473958, "learning_rate": 0.0001, "loss": 3.7576, "loss/crossentropy": 2.1534887552261353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1833745241165161, "step": 26740 }, { "epoch": 0.53484, "grad_norm": 2.015625, "grad_norm_var": 0.008381907145182292, "learning_rate": 0.0001, "loss": 3.9765, "loss/crossentropy": 1.4169192910194397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14363554865121841, "step": 26742 }, { "epoch": 0.53488, "grad_norm": 1.953125, "grad_norm_var": 0.007340240478515625, "learning_rate": 0.0001, "loss": 4.046, "loss/crossentropy": 2.149993062019348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20160876959562302, "step": 26744 }, { "epoch": 0.53492, "grad_norm": 1.8203125, "grad_norm_var": 0.007433827718098958, "learning_rate": 0.0001, "loss": 4.0038, "loss/crossentropy": 2.032483458518982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18361316621303558, "step": 26746 }, { "epoch": 0.53496, "grad_norm": 1.8984375, "grad_norm_var": 0.007979329427083333, "learning_rate": 0.0001, "loss": 4.0003, "loss/crossentropy": 2.253453016281128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20721124857664108, "step": 26748 }, { "epoch": 0.535, "grad_norm": 1.796875, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 3.983, "loss/crossentropy": 1.8833884000778198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19324089586734772, "step": 26750 }, { "epoch": 0.53504, "grad_norm": 2.078125, "grad_norm_var": 0.011427561442057291, "learning_rate": 0.0001, "loss": 4.1681, "loss/crossentropy": 2.146417558193207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19708352535963058, "step": 26752 }, { "epoch": 0.53508, "grad_norm": 2.109375, "grad_norm_var": 0.013248443603515625, "learning_rate": 0.0001, "loss": 4.1508, "loss/crossentropy": 1.9810682535171509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777956709265709, "step": 26754 }, { "epoch": 0.53512, "grad_norm": 1.859375, "grad_norm_var": 0.010426584879557292, "learning_rate": 0.0001, "loss": 3.9092, "loss/crossentropy": 2.0470046401023865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1866726651787758, "step": 26756 }, { "epoch": 0.53516, "grad_norm": 1.8046875, "grad_norm_var": 0.009287261962890625, "learning_rate": 0.0001, "loss": 3.9476, "loss/crossentropy": 2.0760093927383423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18907316029071808, "step": 26758 }, { "epoch": 0.5352, "grad_norm": 2.015625, "grad_norm_var": 0.011110178629557292, "learning_rate": 0.0001, "loss": 3.8118, "loss/crossentropy": 1.7074882984161377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16364683210849762, "step": 26760 }, { "epoch": 0.53524, "grad_norm": 1.8828125, "grad_norm_var": 0.011614735921223958, "learning_rate": 0.0001, "loss": 4.196, "loss/crossentropy": 1.925625503063202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18412255495786667, "step": 26762 }, { "epoch": 0.53528, "grad_norm": 2.0, "grad_norm_var": 0.012962849934895833, "learning_rate": 0.0001, "loss": 3.9354, "loss/crossentropy": 2.03251188993454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1895642802119255, "step": 26764 }, { "epoch": 0.53532, "grad_norm": 2.03125, "grad_norm_var": 0.013012440999348958, "learning_rate": 0.0001, "loss": 3.9456, "loss/crossentropy": 1.9689412117004395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17626702040433884, "step": 26766 }, { "epoch": 0.53536, "grad_norm": 1.921875, "grad_norm_var": 0.011864980061848959, "learning_rate": 0.0001, "loss": 3.9408, "loss/crossentropy": 1.7644684314727783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18325244635343552, "step": 26768 }, { "epoch": 0.5354, "grad_norm": 1.984375, "grad_norm_var": 0.009544881184895833, "learning_rate": 0.0001, "loss": 3.8692, "loss/crossentropy": 1.9543325304985046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18265419453382492, "step": 26770 }, { "epoch": 0.53544, "grad_norm": 1.9921875, "grad_norm_var": 0.012703196207682291, "learning_rate": 0.0001, "loss": 4.2303, "loss/crossentropy": 2.107349157333374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18807468563318253, "step": 26772 }, { "epoch": 0.53548, "grad_norm": 1.9140625, "grad_norm_var": 0.022900390625, "learning_rate": 0.0001, "loss": 4.1627, "loss/crossentropy": 2.0298044085502625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947350949048996, "step": 26774 }, { "epoch": 0.53552, "grad_norm": 1.953125, "grad_norm_var": 0.020491536458333334, "learning_rate": 0.0001, "loss": 4.06, "loss/crossentropy": 2.1605560183525085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067297026515007, "step": 26776 }, { "epoch": 0.53556, "grad_norm": 2.15625, "grad_norm_var": 0.021762847900390625, "learning_rate": 0.0001, "loss": 4.2464, "loss/crossentropy": 1.9689620733261108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24544911086559296, "step": 26778 }, { "epoch": 0.5356, "grad_norm": 1.953125, "grad_norm_var": 0.018723297119140624, "learning_rate": 0.0001, "loss": 3.93, "loss/crossentropy": 1.7339588403701782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21119988709688187, "step": 26780 }, { "epoch": 0.53564, "grad_norm": 1.9609375, "grad_norm_var": 0.018260701497395834, "learning_rate": 0.0001, "loss": 3.8853, "loss/crossentropy": 2.3095227479934692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19727592915296555, "step": 26782 }, { "epoch": 0.53568, "grad_norm": 1.8515625, "grad_norm_var": 0.017661285400390626, "learning_rate": 0.0001, "loss": 3.9575, "loss/crossentropy": 2.1668676137924194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18963289260864258, "step": 26784 }, { "epoch": 0.53572, "grad_norm": 2.0, "grad_norm_var": 0.018683878580729167, "learning_rate": 0.0001, "loss": 3.7399, "loss/crossentropy": 1.7347453236579895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1699044108390808, "step": 26786 }, { "epoch": 0.53576, "grad_norm": 1.8515625, "grad_norm_var": 0.0186431884765625, "learning_rate": 0.0001, "loss": 3.9689, "loss/crossentropy": 2.144263744354248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21406567096710205, "step": 26788 }, { "epoch": 0.5358, "grad_norm": 1.9921875, "grad_norm_var": 0.008202870686848959, "learning_rate": 0.0001, "loss": 4.0654, "loss/crossentropy": 2.2909047603607178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19608308374881744, "step": 26790 }, { "epoch": 0.53584, "grad_norm": 1.9453125, "grad_norm_var": 0.0089752197265625, "learning_rate": 0.0001, "loss": 3.748, "loss/crossentropy": 2.0192511677742004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19014007598161697, "step": 26792 }, { "epoch": 0.53588, "grad_norm": 1.9453125, "grad_norm_var": 0.005879720052083333, "learning_rate": 0.0001, "loss": 4.1532, "loss/crossentropy": 1.848621129989624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16638639569282532, "step": 26794 }, { "epoch": 0.53592, "grad_norm": 1.7890625, "grad_norm_var": 0.008128865559895834, "learning_rate": 0.0001, "loss": 3.8812, "loss/crossentropy": 1.8177622556686401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17412111163139343, "step": 26796 }, { "epoch": 0.53596, "grad_norm": 1.8671875, "grad_norm_var": 0.009490712483723959, "learning_rate": 0.0001, "loss": 3.8414, "loss/crossentropy": 1.8192716240882874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1820041686296463, "step": 26798 }, { "epoch": 0.536, "grad_norm": 1.8671875, "grad_norm_var": 0.037534332275390624, "learning_rate": 0.0001, "loss": 3.9625, "loss/crossentropy": 2.037019371986389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19533763080835342, "step": 26800 }, { "epoch": 0.53604, "grad_norm": 1.828125, "grad_norm_var": 0.03748779296875, "learning_rate": 0.0001, "loss": 4.1231, "loss/crossentropy": 2.3284034729003906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20415876060724258, "step": 26802 }, { "epoch": 0.53608, "grad_norm": 2.140625, "grad_norm_var": 0.04108861287434896, "learning_rate": 0.0001, "loss": 4.5056, "loss/crossentropy": 1.9130100011825562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19624194502830505, "step": 26804 }, { "epoch": 0.53612, "grad_norm": 1.859375, "grad_norm_var": 0.04149754842122396, "learning_rate": 0.0001, "loss": 3.9375, "loss/crossentropy": 1.9683635234832764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2047107145190239, "step": 26806 }, { "epoch": 0.53616, "grad_norm": 1.890625, "grad_norm_var": 0.0410552978515625, "learning_rate": 0.0001, "loss": 4.0838, "loss/crossentropy": 2.3120559453964233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2024022340774536, "step": 26808 }, { "epoch": 0.5362, "grad_norm": 1.9140625, "grad_norm_var": 0.0410552978515625, "learning_rate": 0.0001, "loss": 3.9571, "loss/crossentropy": 2.0975415110588074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19205840677022934, "step": 26810 }, { "epoch": 0.53624, "grad_norm": 1.9609375, "grad_norm_var": 0.03737564086914062, "learning_rate": 0.0001, "loss": 3.9933, "loss/crossentropy": 1.995516061782837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18409030139446259, "step": 26812 }, { "epoch": 0.53628, "grad_norm": 1.796875, "grad_norm_var": 0.03712539672851563, "learning_rate": 0.0001, "loss": 3.5401, "loss/crossentropy": 1.9188540577888489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17965808510780334, "step": 26814 }, { "epoch": 0.53632, "grad_norm": 1.9375, "grad_norm_var": 0.03156636555989583, "learning_rate": 0.0001, "loss": 4.127, "loss/crossentropy": 2.1765600442886353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20042569190263748, "step": 26816 }, { "epoch": 0.53636, "grad_norm": 1.96875, "grad_norm_var": 0.031243642171223957, "learning_rate": 0.0001, "loss": 4.1306, "loss/crossentropy": 2.2123221158981323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19921905547380447, "step": 26818 }, { "epoch": 0.5364, "grad_norm": 1.9296875, "grad_norm_var": 0.027950032552083334, "learning_rate": 0.0001, "loss": 4.091, "loss/crossentropy": 2.130329966545105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274868607521057, "step": 26820 }, { "epoch": 0.53644, "grad_norm": 1.8671875, "grad_norm_var": 0.02783788045247396, "learning_rate": 0.0001, "loss": 3.9039, "loss/crossentropy": 1.8405003547668457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1754842922091484, "step": 26822 }, { "epoch": 0.53648, "grad_norm": 1.9765625, "grad_norm_var": 0.027611287434895833, "learning_rate": 0.0001, "loss": 4.1288, "loss/crossentropy": 2.1880215406417847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983429118990898, "step": 26824 }, { "epoch": 0.53652, "grad_norm": 1.9453125, "grad_norm_var": 0.027486165364583332, "learning_rate": 0.0001, "loss": 4.0219, "loss/crossentropy": 2.1088255643844604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2141232043504715, "step": 26826 }, { "epoch": 0.53656, "grad_norm": 1.8125, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 3.8428, "loss/crossentropy": 2.2226059436798096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20068518072366714, "step": 26828 }, { "epoch": 0.5366, "grad_norm": 1.9453125, "grad_norm_var": 0.02655614217122396, "learning_rate": 0.0001, "loss": 3.9276, "loss/crossentropy": 2.0935966968536377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20954880118370056, "step": 26830 }, { "epoch": 0.53664, "grad_norm": 1.8828125, "grad_norm_var": 0.0034238179524739582, "learning_rate": 0.0001, "loss": 4.1073, "loss/crossentropy": 1.9899848699569702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19131190329790115, "step": 26832 }, { "epoch": 0.53668, "grad_norm": 1.9296875, "grad_norm_var": 0.002854156494140625, "learning_rate": 0.0001, "loss": 4.0719, "loss/crossentropy": 1.9950811862945557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20007413625717163, "step": 26834 }, { "epoch": 0.53672, "grad_norm": 1.953125, "grad_norm_var": 0.003763580322265625, "learning_rate": 0.0001, "loss": 3.6452, "loss/crossentropy": 2.1717607975006104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19856677949428558, "step": 26836 }, { "epoch": 0.53676, "grad_norm": 2.0, "grad_norm_var": 0.003780110677083333, "learning_rate": 0.0001, "loss": 4.1077, "loss/crossentropy": 2.0720032453536987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17657846212387085, "step": 26838 }, { "epoch": 0.5368, "grad_norm": 1.90625, "grad_norm_var": 0.005541737874348958, "learning_rate": 0.0001, "loss": 3.776, "loss/crossentropy": 2.0533303022384644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19667675346136093, "step": 26840 }, { "epoch": 0.53684, "grad_norm": 1.9921875, "grad_norm_var": 0.005844879150390625, "learning_rate": 0.0001, "loss": 3.9664, "loss/crossentropy": 1.8576315641403198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17336532473564148, "step": 26842 }, { "epoch": 0.53688, "grad_norm": 2.0625, "grad_norm_var": 0.0067179361979166664, "learning_rate": 0.0001, "loss": 4.1348, "loss/crossentropy": 2.250246524810791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19317224621772766, "step": 26844 }, { "epoch": 0.53692, "grad_norm": 1.90625, "grad_norm_var": 0.006239573160807292, "learning_rate": 0.0001, "loss": 4.2467, "loss/crossentropy": 2.241440773010254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20358380675315857, "step": 26846 }, { "epoch": 0.53696, "grad_norm": 2.125, "grad_norm_var": 0.009332021077473959, "learning_rate": 0.0001, "loss": 4.1142, "loss/crossentropy": 1.8086896538734436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17357632517814636, "step": 26848 }, { "epoch": 0.537, "grad_norm": 2.078125, "grad_norm_var": 0.010695139567057291, "learning_rate": 0.0001, "loss": 4.3621, "loss/crossentropy": 2.396283984184265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21695053577423096, "step": 26850 }, { "epoch": 0.53704, "grad_norm": 2.09375, "grad_norm_var": 0.010477447509765625, "learning_rate": 0.0001, "loss": 4.2781, "loss/crossentropy": 2.1522679328918457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21720080822706223, "step": 26852 }, { "epoch": 0.53708, "grad_norm": 1.90625, "grad_norm_var": 0.011763254801432291, "learning_rate": 0.0001, "loss": 3.8039, "loss/crossentropy": 1.5082102417945862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15677820146083832, "step": 26854 }, { "epoch": 0.53712, "grad_norm": 1.875, "grad_norm_var": 0.014855702718098959, "learning_rate": 0.0001, "loss": 4.1837, "loss/crossentropy": 2.1900625228881836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19950998574495316, "step": 26856 }, { "epoch": 0.53716, "grad_norm": 1.984375, "grad_norm_var": 0.015409088134765625, "learning_rate": 0.0001, "loss": 4.2357, "loss/crossentropy": 1.978192925453186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19752255082130432, "step": 26858 }, { "epoch": 0.5372, "grad_norm": 1.859375, "grad_norm_var": 0.015962727864583335, "learning_rate": 0.0001, "loss": 4.0597, "loss/crossentropy": 1.9195253252983093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122282087802887, "step": 26860 }, { "epoch": 0.53724, "grad_norm": 1.921875, "grad_norm_var": 0.015913899739583334, "learning_rate": 0.0001, "loss": 4.0172, "loss/crossentropy": 1.7090142369270325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1740838885307312, "step": 26862 }, { "epoch": 0.53728, "grad_norm": 1.9296875, "grad_norm_var": 0.029788970947265625, "learning_rate": 0.0001, "loss": 3.9981, "loss/crossentropy": 1.9377044439315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18080267310142517, "step": 26864 }, { "epoch": 0.53732, "grad_norm": 1.9140625, "grad_norm_var": 0.029499308268229166, "learning_rate": 0.0001, "loss": 3.7442, "loss/crossentropy": 1.635703206062317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16993890702724457, "step": 26866 }, { "epoch": 0.53736, "grad_norm": 1.8359375, "grad_norm_var": 0.03104426066080729, "learning_rate": 0.0001, "loss": 3.6425, "loss/crossentropy": 2.10710072517395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069242298603058, "step": 26868 }, { "epoch": 0.5374, "grad_norm": 1.859375, "grad_norm_var": 0.030543772379557292, "learning_rate": 0.0001, "loss": 3.9258, "loss/crossentropy": 1.9034352898597717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17317350208759308, "step": 26870 }, { "epoch": 0.53744, "grad_norm": 1.8125, "grad_norm_var": 0.027032216389973957, "learning_rate": 0.0001, "loss": 3.7471, "loss/crossentropy": 1.97059965133667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18057716637849808, "step": 26872 }, { "epoch": 0.53748, "grad_norm": 1.96875, "grad_norm_var": 0.02578913370768229, "learning_rate": 0.0001, "loss": 3.92, "loss/crossentropy": 2.11410790681839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19710883498191833, "step": 26874 }, { "epoch": 0.53752, "grad_norm": 1.859375, "grad_norm_var": 0.026652018229166668, "learning_rate": 0.0001, "loss": 3.7229, "loss/crossentropy": 2.1530507802963257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18232395499944687, "step": 26876 }, { "epoch": 0.53756, "grad_norm": 1.8203125, "grad_norm_var": 0.027286783854166666, "learning_rate": 0.0001, "loss": 3.9919, "loss/crossentropy": 1.9864270687103271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18189801275730133, "step": 26878 }, { "epoch": 0.5376, "grad_norm": 1.9140625, "grad_norm_var": 0.004610188802083333, "learning_rate": 0.0001, "loss": 3.8101, "loss/crossentropy": 1.7933497428894043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16698282212018967, "step": 26880 }, { "epoch": 0.53764, "grad_norm": 1.984375, "grad_norm_var": 0.005102284749348958, "learning_rate": 0.0001, "loss": 4.096, "loss/crossentropy": 2.1818475127220154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2031894475221634, "step": 26882 }, { "epoch": 0.53768, "grad_norm": 1.9140625, "grad_norm_var": 0.004980214436848958, "learning_rate": 0.0001, "loss": 3.8372, "loss/crossentropy": 1.7670655250549316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19976353645324707, "step": 26884 }, { "epoch": 0.53772, "grad_norm": 1.8828125, "grad_norm_var": 0.007252756754557292, "learning_rate": 0.0001, "loss": 3.7664, "loss/crossentropy": 1.9625222086906433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887579709291458, "step": 26886 }, { "epoch": 0.53776, "grad_norm": 1.890625, "grad_norm_var": 0.006082916259765625, "learning_rate": 0.0001, "loss": 4.065, "loss/crossentropy": 1.989456593990326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18352140486240387, "step": 26888 }, { "epoch": 0.5378, "grad_norm": 1.953125, "grad_norm_var": 0.0054595947265625, "learning_rate": 0.0001, "loss": 3.8264, "loss/crossentropy": 1.9268362522125244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18406879901885986, "step": 26890 }, { "epoch": 0.53784, "grad_norm": 1.7890625, "grad_norm_var": 0.005515289306640625, "learning_rate": 0.0001, "loss": 4.0215, "loss/crossentropy": 2.3704354763031006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20472833514213562, "step": 26892 }, { "epoch": 0.53788, "grad_norm": 1.8515625, "grad_norm_var": 0.006831614176432291, "learning_rate": 0.0001, "loss": 4.0136, "loss/crossentropy": 2.2468762397766113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20067602396011353, "step": 26894 }, { "epoch": 0.53792, "grad_norm": 1.9453125, "grad_norm_var": 0.007456207275390625, "learning_rate": 0.0001, "loss": 4.0154, "loss/crossentropy": 1.9968597888946533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19497767835855484, "step": 26896 }, { "epoch": 0.53796, "grad_norm": 2.03125, "grad_norm_var": 0.008861287434895834, "learning_rate": 0.0001, "loss": 4.3752, "loss/crossentropy": 2.3659461736679077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061542198061943, "step": 26898 }, { "epoch": 0.538, "grad_norm": 1.8203125, "grad_norm_var": 0.00928955078125, "learning_rate": 0.0001, "loss": 3.7333, "loss/crossentropy": 2.1831624507904053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18195770680904388, "step": 26900 }, { "epoch": 0.53804, "grad_norm": 2.03125, "grad_norm_var": 0.007419586181640625, "learning_rate": 0.0001, "loss": 3.9325, "loss/crossentropy": 2.011473059654236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17340296506881714, "step": 26902 }, { "epoch": 0.53808, "grad_norm": 2.109375, "grad_norm_var": 0.009751129150390624, "learning_rate": 0.0001, "loss": 4.2062, "loss/crossentropy": 2.0382936000823975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20788908749818802, "step": 26904 }, { "epoch": 0.53812, "grad_norm": 2.015625, "grad_norm_var": 0.011530558268229166, "learning_rate": 0.0001, "loss": 3.7917, "loss/crossentropy": 1.9090477228164673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1654892861843109, "step": 26906 }, { "epoch": 0.53816, "grad_norm": 2.15625, "grad_norm_var": 0.013850657145182292, "learning_rate": 0.0001, "loss": 4.1095, "loss/crossentropy": 2.1782519817352295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20165212452411652, "step": 26908 }, { "epoch": 0.5382, "grad_norm": 2.0625, "grad_norm_var": 0.0138336181640625, "learning_rate": 0.0001, "loss": 4.0269, "loss/crossentropy": 2.3456512689590454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19297856092453003, "step": 26910 }, { "epoch": 0.53824, "grad_norm": 2.046875, "grad_norm_var": 0.013108062744140624, "learning_rate": 0.0001, "loss": 4.1148, "loss/crossentropy": 2.1863032579421997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19647317379713058, "step": 26912 }, { "epoch": 0.53828, "grad_norm": 1.8359375, "grad_norm_var": 0.013187408447265625, "learning_rate": 0.0001, "loss": 3.8884, "loss/crossentropy": 1.824396550655365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18155476450920105, "step": 26914 }, { "epoch": 0.53832, "grad_norm": 1.9765625, "grad_norm_var": 0.013630930582682292, "learning_rate": 0.0001, "loss": 3.9332, "loss/crossentropy": 1.913030207157135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16873834282159805, "step": 26916 }, { "epoch": 0.53836, "grad_norm": 1.9140625, "grad_norm_var": 0.012780507405598959, "learning_rate": 0.0001, "loss": 4.0736, "loss/crossentropy": 1.791024386882782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17709940671920776, "step": 26918 }, { "epoch": 0.5384, "grad_norm": 1.9921875, "grad_norm_var": 0.010857899983723959, "learning_rate": 0.0001, "loss": 4.0551, "loss/crossentropy": 1.9327716827392578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17906151711940765, "step": 26920 }, { "epoch": 0.53844, "grad_norm": 1.8046875, "grad_norm_var": 0.010290273030598958, "learning_rate": 0.0001, "loss": 3.9922, "loss/crossentropy": 2.1072241067886353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19492276012897491, "step": 26922 }, { "epoch": 0.53848, "grad_norm": 1.796875, "grad_norm_var": 0.009993235270182291, "learning_rate": 0.0001, "loss": 4.039, "loss/crossentropy": 2.024174690246582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2018662467598915, "step": 26924 }, { "epoch": 0.53852, "grad_norm": 1.90625, "grad_norm_var": 0.00892333984375, "learning_rate": 0.0001, "loss": 3.988, "loss/crossentropy": 1.9384547472000122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1663040593266487, "step": 26926 }, { "epoch": 0.53856, "grad_norm": 1.8828125, "grad_norm_var": 0.007692209879557292, "learning_rate": 0.0001, "loss": 3.7386, "loss/crossentropy": 2.0045499205589294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18697284162044525, "step": 26928 }, { "epoch": 0.5386, "grad_norm": 1.9921875, "grad_norm_var": 0.008038075764973958, "learning_rate": 0.0001, "loss": 4.1255, "loss/crossentropy": 2.3768097162246704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20385072380304337, "step": 26930 }, { "epoch": 0.53864, "grad_norm": 1.8203125, "grad_norm_var": 0.008097330729166666, "learning_rate": 0.0001, "loss": 3.6845, "loss/crossentropy": 1.8940687775611877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1758279949426651, "step": 26932 }, { "epoch": 0.53868, "grad_norm": 1.8515625, "grad_norm_var": 0.008503214518229166, "learning_rate": 0.0001, "loss": 4.1272, "loss/crossentropy": 2.258841872215271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18384167551994324, "step": 26934 }, { "epoch": 0.53872, "grad_norm": 1.8984375, "grad_norm_var": 0.010115305582682291, "learning_rate": 0.0001, "loss": 4.1708, "loss/crossentropy": 2.3876583576202393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20442500710487366, "step": 26936 }, { "epoch": 0.53876, "grad_norm": 1.9140625, "grad_norm_var": 0.010106404622395834, "learning_rate": 0.0001, "loss": 3.975, "loss/crossentropy": 2.3199750185012817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975177749991417, "step": 26938 }, { "epoch": 0.5388, "grad_norm": 1.8359375, "grad_norm_var": 0.006365712483723958, "learning_rate": 0.0001, "loss": 4.1408, "loss/crossentropy": 2.2277639508247375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962454915046692, "step": 26940 }, { "epoch": 0.53884, "grad_norm": 1.8984375, "grad_norm_var": 0.009844716389973958, "learning_rate": 0.0001, "loss": 4.1696, "loss/crossentropy": 2.2041778564453125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19852803647518158, "step": 26942 }, { "epoch": 0.53888, "grad_norm": 1.8359375, "grad_norm_var": 0.009924062093098958, "learning_rate": 0.0001, "loss": 3.844, "loss/crossentropy": 1.6400386095046997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15487553179264069, "step": 26944 }, { "epoch": 0.53892, "grad_norm": 1.8828125, "grad_norm_var": 0.012059529622395834, "learning_rate": 0.0001, "loss": 4.0689, "loss/crossentropy": 2.4118189811706543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22375066578388214, "step": 26946 }, { "epoch": 0.53896, "grad_norm": 1.921875, "grad_norm_var": 0.014253489176432292, "learning_rate": 0.0001, "loss": 3.932, "loss/crossentropy": 2.238221287727356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18950769305229187, "step": 26948 }, { "epoch": 0.539, "grad_norm": 1.765625, "grad_norm_var": 0.0158935546875, "learning_rate": 0.0001, "loss": 4.0809, "loss/crossentropy": 2.2348607182502747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1920975297689438, "step": 26950 }, { "epoch": 0.53904, "grad_norm": 1.9453125, "grad_norm_var": 0.015954335530598957, "learning_rate": 0.0001, "loss": 4.0146, "loss/crossentropy": 2.2838883996009827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128254398703575, "step": 26952 }, { "epoch": 0.53908, "grad_norm": 1.953125, "grad_norm_var": 0.0150787353515625, "learning_rate": 0.0001, "loss": 3.7944, "loss/crossentropy": 2.183197498321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2030157521367073, "step": 26954 }, { "epoch": 0.53912, "grad_norm": 1.8984375, "grad_norm_var": 0.014579010009765626, "learning_rate": 0.0001, "loss": 4.1249, "loss/crossentropy": 2.132546067237854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2098417431116104, "step": 26956 }, { "epoch": 0.53916, "grad_norm": 1.953125, "grad_norm_var": 0.011839803059895833, "learning_rate": 0.0001, "loss": 3.9566, "loss/crossentropy": 1.867222011089325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18416466563940048, "step": 26958 }, { "epoch": 0.5392, "grad_norm": 1.875, "grad_norm_var": 0.013710276285807291, "learning_rate": 0.0001, "loss": 4.3045, "loss/crossentropy": 2.0456870198249817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19690076261758804, "step": 26960 }, { "epoch": 0.53924, "grad_norm": 1.953125, "grad_norm_var": 0.017731730143229166, "learning_rate": 0.0001, "loss": 4.207, "loss/crossentropy": 2.387152314186096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2672813981771469, "step": 26962 }, { "epoch": 0.53928, "grad_norm": 1.8125, "grad_norm_var": 0.014679972330729167, "learning_rate": 0.0001, "loss": 3.8385, "loss/crossentropy": 1.7455761432647705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.158515103161335, "step": 26964 }, { "epoch": 0.53932, "grad_norm": 1.8203125, "grad_norm_var": 0.013643391927083333, "learning_rate": 0.0001, "loss": 4.0564, "loss/crossentropy": 2.0408239364624023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18679992854595184, "step": 26966 }, { "epoch": 0.53936, "grad_norm": 2.015625, "grad_norm_var": 0.012555948893229167, "learning_rate": 0.0001, "loss": 4.1616, "loss/crossentropy": 2.1810312271118164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18764056265354156, "step": 26968 }, { "epoch": 0.5394, "grad_norm": 1.9609375, "grad_norm_var": 0.012015787760416667, "learning_rate": 0.0001, "loss": 3.9197, "loss/crossentropy": 2.152927279472351, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18063102662563324, "step": 26970 }, { "epoch": 0.53944, "grad_norm": 1.96875, "grad_norm_var": 0.01220703125, "learning_rate": 0.0001, "loss": 3.912, "loss/crossentropy": 2.006937623023987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17757008224725723, "step": 26972 }, { "epoch": 0.53948, "grad_norm": 1.8671875, "grad_norm_var": 0.01268310546875, "learning_rate": 0.0001, "loss": 4.0045, "loss/crossentropy": 1.7032560110092163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16759414970874786, "step": 26974 }, { "epoch": 0.53952, "grad_norm": 1.921875, "grad_norm_var": 0.014506022135416666, "learning_rate": 0.0001, "loss": 4.0903, "loss/crossentropy": 2.0685030817985535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18894075602293015, "step": 26976 }, { "epoch": 0.53956, "grad_norm": 1.8125, "grad_norm_var": 0.010015614827473958, "learning_rate": 0.0001, "loss": 3.5637, "loss/crossentropy": 1.7113747000694275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16482854634523392, "step": 26978 }, { "epoch": 0.5396, "grad_norm": 1.8515625, "grad_norm_var": 0.009208170572916667, "learning_rate": 0.0001, "loss": 3.8915, "loss/crossentropy": 1.9814255237579346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19485389441251755, "step": 26980 }, { "epoch": 0.53964, "grad_norm": 1.8203125, "grad_norm_var": 0.009466298421223958, "learning_rate": 0.0001, "loss": 3.7662, "loss/crossentropy": 1.825324296951294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.172153040766716, "step": 26982 }, { "epoch": 0.53968, "grad_norm": 1.9453125, "grad_norm_var": 0.008837636311848958, "learning_rate": 0.0001, "loss": 4.0153, "loss/crossentropy": 2.093530297279358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956939622759819, "step": 26984 }, { "epoch": 0.53972, "grad_norm": 1.9140625, "grad_norm_var": 0.010716756184895834, "learning_rate": 0.0001, "loss": 4.3494, "loss/crossentropy": 2.333137631416321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21523389220237732, "step": 26986 }, { "epoch": 0.53976, "grad_norm": 1.9296875, "grad_norm_var": 0.010814412434895834, "learning_rate": 0.0001, "loss": 3.8208, "loss/crossentropy": 1.6317270994186401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16613581776618958, "step": 26988 }, { "epoch": 0.5398, "grad_norm": 1.96875, "grad_norm_var": 0.010560862223307292, "learning_rate": 0.0001, "loss": 3.7877, "loss/crossentropy": 2.286810517311096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21348290145397186, "step": 26990 }, { "epoch": 0.53984, "grad_norm": 1.6875, "grad_norm_var": 0.008597819010416667, "learning_rate": 0.0001, "loss": 3.8261, "loss/crossentropy": 2.2280890941619873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20495467633008957, "step": 26992 }, { "epoch": 0.53988, "grad_norm": 1.8125, "grad_norm_var": 0.0084625244140625, "learning_rate": 0.0001, "loss": 3.7743, "loss/crossentropy": 1.4591253399848938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1521049663424492, "step": 26994 }, { "epoch": 0.53992, "grad_norm": 1.875, "grad_norm_var": 0.0085205078125, "learning_rate": 0.0001, "loss": 4.0559, "loss/crossentropy": 2.236023187637329, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155672013759613, "step": 26996 }, { "epoch": 0.53996, "grad_norm": 2.078125, "grad_norm_var": 0.0094390869140625, "learning_rate": 0.0001, "loss": 4.1202, "loss/crossentropy": 2.0742298364639282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17755182832479477, "step": 26998 }, { "epoch": 0.54, "grad_norm": 1.8828125, "grad_norm_var": 0.009562174479166666, "learning_rate": 0.0001, "loss": 3.8773, "loss/crossentropy": 1.9153380990028381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838880032300949, "step": 27000 }, { "epoch": 0.54004, "grad_norm": 1.8359375, "grad_norm_var": 0.009349568684895834, "learning_rate": 0.0001, "loss": 4.062, "loss/crossentropy": 2.2510672211647034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2028658166527748, "step": 27002 }, { "epoch": 0.54008, "grad_norm": 1.75, "grad_norm_var": 0.011016591389973959, "learning_rate": 0.0001, "loss": 3.5163, "loss/crossentropy": 1.9110174179077148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18953163921833038, "step": 27004 }, { "epoch": 0.54012, "grad_norm": 1.8125, "grad_norm_var": 0.012604777018229167, "learning_rate": 0.0001, "loss": 3.9874, "loss/crossentropy": 1.8102795481681824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16844668984413147, "step": 27006 }, { "epoch": 0.54016, "grad_norm": 1.9921875, "grad_norm_var": 0.009797159830729167, "learning_rate": 0.0001, "loss": 4.1755, "loss/crossentropy": 1.7649898529052734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18175919353961945, "step": 27008 }, { "epoch": 0.5402, "grad_norm": 2.078125, "grad_norm_var": 0.011104075113932292, "learning_rate": 0.0001, "loss": 4.0751, "loss/crossentropy": 2.199475884437561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19509385526180267, "step": 27010 }, { "epoch": 0.54024, "grad_norm": 1.9765625, "grad_norm_var": 0.010935211181640625, "learning_rate": 0.0001, "loss": 4.1241, "loss/crossentropy": 2.313786506652832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2032681107521057, "step": 27012 }, { "epoch": 0.54028, "grad_norm": 1.9921875, "grad_norm_var": 0.009992472330729167, "learning_rate": 0.0001, "loss": 4.1953, "loss/crossentropy": 2.040158271789551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20921558886766434, "step": 27014 }, { "epoch": 0.54032, "grad_norm": 1.875, "grad_norm_var": 0.010871378580729167, "learning_rate": 0.0001, "loss": 3.7967, "loss/crossentropy": 2.0283621549606323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19085510820150375, "step": 27016 }, { "epoch": 0.54036, "grad_norm": 1.9921875, "grad_norm_var": 0.015941365559895834, "learning_rate": 0.0001, "loss": 3.9703, "loss/crossentropy": 2.1031923294067383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19315888732671738, "step": 27018 }, { "epoch": 0.5404, "grad_norm": 1.8515625, "grad_norm_var": 0.013142903645833334, "learning_rate": 0.0001, "loss": 3.7665, "loss/crossentropy": 1.8264936804771423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17999880015850067, "step": 27020 }, { "epoch": 0.54044, "grad_norm": 1.9453125, "grad_norm_var": 0.011531321207682292, "learning_rate": 0.0001, "loss": 3.7928, "loss/crossentropy": 2.127643585205078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20216452330350876, "step": 27022 }, { "epoch": 0.54048, "grad_norm": 1.9375, "grad_norm_var": 0.0120849609375, "learning_rate": 0.0001, "loss": 3.9311, "loss/crossentropy": 1.8567208647727966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17688161879777908, "step": 27024 }, { "epoch": 0.54052, "grad_norm": 1.890625, "grad_norm_var": 0.0108062744140625, "learning_rate": 0.0001, "loss": 3.8285, "loss/crossentropy": 2.0056475400924683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16437891870737076, "step": 27026 }, { "epoch": 0.54056, "grad_norm": 1.875, "grad_norm_var": 0.012023671468098959, "learning_rate": 0.0001, "loss": 4.0068, "loss/crossentropy": 2.2776039838790894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21126240491867065, "step": 27028 }, { "epoch": 0.5406, "grad_norm": 1.9296875, "grad_norm_var": 0.012674967447916666, "learning_rate": 0.0001, "loss": 4.2638, "loss/crossentropy": 2.2637280225753784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19725032150745392, "step": 27030 }, { "epoch": 0.54064, "grad_norm": 1.859375, "grad_norm_var": 0.011895497639973959, "learning_rate": 0.0001, "loss": 3.837, "loss/crossentropy": 1.6967254281044006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1950267255306244, "step": 27032 }, { "epoch": 0.54068, "grad_norm": 1.9140625, "grad_norm_var": 0.008591461181640624, "learning_rate": 0.0001, "loss": 4.3259, "loss/crossentropy": 2.2444742918014526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20126602053642273, "step": 27034 }, { "epoch": 0.54072, "grad_norm": 1.9140625, "grad_norm_var": 0.008656565348307292, "learning_rate": 0.0001, "loss": 3.8887, "loss/crossentropy": 2.216984808444977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18849068880081177, "step": 27036 }, { "epoch": 0.54076, "grad_norm": 1.90625, "grad_norm_var": 0.008296712239583334, "learning_rate": 0.0001, "loss": 3.9656, "loss/crossentropy": 2.089816451072693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2007404863834381, "step": 27038 }, { "epoch": 0.5408, "grad_norm": 1.8671875, "grad_norm_var": 0.0071370442708333336, "learning_rate": 0.0001, "loss": 3.7989, "loss/crossentropy": 1.839013695716858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17485745251178741, "step": 27040 }, { "epoch": 0.54084, "grad_norm": 2.0, "grad_norm_var": 0.007393391927083334, "learning_rate": 0.0001, "loss": 4.3015, "loss/crossentropy": 2.3915692567825317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21270561963319778, "step": 27042 }, { "epoch": 0.54088, "grad_norm": 2.046875, "grad_norm_var": 0.0087799072265625, "learning_rate": 0.0001, "loss": 3.8979, "loss/crossentropy": 2.2086989879608154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21337078511714935, "step": 27044 }, { "epoch": 0.54092, "grad_norm": 2.0, "grad_norm_var": 0.008971913655598959, "learning_rate": 0.0001, "loss": 4.241, "loss/crossentropy": 1.8932998776435852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1783016324043274, "step": 27046 }, { "epoch": 0.54096, "grad_norm": 1.875, "grad_norm_var": 0.008479563395182292, "learning_rate": 0.0001, "loss": 3.9551, "loss/crossentropy": 1.9314876198768616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1857961043715477, "step": 27048 }, { "epoch": 0.541, "grad_norm": 1.828125, "grad_norm_var": 0.0074371337890625, "learning_rate": 0.0001, "loss": 3.875, "loss/crossentropy": 1.616044044494629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16122639179229736, "step": 27050 }, { "epoch": 0.54104, "grad_norm": 1.84375, "grad_norm_var": 0.007572174072265625, "learning_rate": 0.0001, "loss": 3.9375, "loss/crossentropy": 1.7567220330238342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18351546674966812, "step": 27052 }, { "epoch": 0.54108, "grad_norm": 1.8984375, "grad_norm_var": 0.007350413004557291, "learning_rate": 0.0001, "loss": 3.9276, "loss/crossentropy": 1.8721659779548645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19143584370613098, "step": 27054 }, { "epoch": 0.54112, "grad_norm": 1.953125, "grad_norm_var": 0.0079254150390625, "learning_rate": 0.0001, "loss": 3.8402, "loss/crossentropy": 1.8633847832679749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18027572333812714, "step": 27056 }, { "epoch": 0.54116, "grad_norm": 1.859375, "grad_norm_var": 0.008455149332682292, "learning_rate": 0.0001, "loss": 4.0355, "loss/crossentropy": 2.0588165521621704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19255611300468445, "step": 27058 }, { "epoch": 0.5412, "grad_norm": 1.9140625, "grad_norm_var": 0.0056304931640625, "learning_rate": 0.0001, "loss": 4.1375, "loss/crossentropy": 2.316554546356201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2137911319732666, "step": 27060 }, { "epoch": 0.54124, "grad_norm": 1.8671875, "grad_norm_var": 0.0047910054524739586, "learning_rate": 0.0001, "loss": 4.0224, "loss/crossentropy": 2.1319636702537537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21494153887033463, "step": 27062 }, { "epoch": 0.54128, "grad_norm": 1.84375, "grad_norm_var": 0.005326334635416667, "learning_rate": 0.0001, "loss": 3.8054, "loss/crossentropy": 1.9028087854385376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869627833366394, "step": 27064 }, { "epoch": 0.54132, "grad_norm": 1.875, "grad_norm_var": 0.0044097900390625, "learning_rate": 0.0001, "loss": 3.9521, "loss/crossentropy": 2.0914413928985596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090655192732811, "step": 27066 }, { "epoch": 0.54136, "grad_norm": 1.8515625, "grad_norm_var": 0.004367828369140625, "learning_rate": 0.0001, "loss": 3.9667, "loss/crossentropy": 2.1949799060821533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19384562969207764, "step": 27068 }, { "epoch": 0.5414, "grad_norm": 1.921875, "grad_norm_var": 0.0045562744140625, "learning_rate": 0.0001, "loss": 4.0931, "loss/crossentropy": 2.076161026954651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17761556804180145, "step": 27070 }, { "epoch": 0.54144, "grad_norm": 1.8828125, "grad_norm_var": 0.004874674479166666, "learning_rate": 0.0001, "loss": 3.9357, "loss/crossentropy": 2.0287702679634094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18184736371040344, "step": 27072 }, { "epoch": 0.54148, "grad_norm": 1.84375, "grad_norm_var": 0.004914347330729167, "learning_rate": 0.0001, "loss": 3.7579, "loss/crossentropy": 2.2093619108200073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20473447442054749, "step": 27074 }, { "epoch": 0.54152, "grad_norm": 1.921875, "grad_norm_var": 0.004646809895833334, "learning_rate": 0.0001, "loss": 4.0884, "loss/crossentropy": 2.076124429702759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19069521129131317, "step": 27076 }, { "epoch": 0.54156, "grad_norm": 1.78125, "grad_norm_var": 0.004303995768229167, "learning_rate": 0.0001, "loss": 3.7931, "loss/crossentropy": 2.014284610748291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18654431402683258, "step": 27078 }, { "epoch": 0.5416, "grad_norm": 1.828125, "grad_norm_var": 0.0044329325358072914, "learning_rate": 0.0001, "loss": 4.0599, "loss/crossentropy": 2.0898516178131104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19269923120737076, "step": 27080 }, { "epoch": 0.54164, "grad_norm": 1.9765625, "grad_norm_var": 0.015819295247395834, "learning_rate": 0.0001, "loss": 4.4848, "loss/crossentropy": 2.1016663908958435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058946192264557, "step": 27082 }, { "epoch": 0.54168, "grad_norm": 2.171875, "grad_norm_var": 0.020896148681640626, "learning_rate": 0.0001, "loss": 3.8787, "loss/crossentropy": 1.6725799441337585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1853618249297142, "step": 27084 }, { "epoch": 0.54172, "grad_norm": 1.9375, "grad_norm_var": 0.020531209309895833, "learning_rate": 0.0001, "loss": 4.1032, "loss/crossentropy": 2.414761781692505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319842129945755, "step": 27086 }, { "epoch": 0.54176, "grad_norm": 1.890625, "grad_norm_var": 0.018344879150390625, "learning_rate": 0.0001, "loss": 4.0617, "loss/crossentropy": 1.887232780456543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1807008758187294, "step": 27088 }, { "epoch": 0.5418, "grad_norm": 1.828125, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 4.3199, "loss/crossentropy": 2.1659968495368958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2109128087759018, "step": 27090 }, { "epoch": 0.54184, "grad_norm": 2.109375, "grad_norm_var": 0.02069269816080729, "learning_rate": 0.0001, "loss": 4.3623, "loss/crossentropy": 2.134926438331604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200565405189991, "step": 27092 }, { "epoch": 0.54188, "grad_norm": 2.03125, "grad_norm_var": 0.016621907552083332, "learning_rate": 0.0001, "loss": 4.3347, "loss/crossentropy": 2.3204580545425415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21586615592241287, "step": 27094 }, { "epoch": 0.54192, "grad_norm": 1.8203125, "grad_norm_var": 0.016462198893229165, "learning_rate": 0.0001, "loss": 3.8415, "loss/crossentropy": 2.1035609245300293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20287606865167618, "step": 27096 }, { "epoch": 0.54196, "grad_norm": 1.96875, "grad_norm_var": 0.0115631103515625, "learning_rate": 0.0001, "loss": 4.0134, "loss/crossentropy": 1.9996371865272522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981046125292778, "step": 27098 }, { "epoch": 0.542, "grad_norm": 2.171875, "grad_norm_var": 0.011220041910807292, "learning_rate": 0.0001, "loss": 4.4227, "loss/crossentropy": 2.2769532203674316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19060392677783966, "step": 27100 }, { "epoch": 0.54204, "grad_norm": 1.765625, "grad_norm_var": 0.014684804280598958, "learning_rate": 0.0001, "loss": 3.92, "loss/crossentropy": 1.8792597651481628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17460587620735168, "step": 27102 }, { "epoch": 0.54208, "grad_norm": 1.8125, "grad_norm_var": 0.01631647745768229, "learning_rate": 0.0001, "loss": 3.9027, "loss/crossentropy": 1.7762881517410278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15890707820653915, "step": 27104 }, { "epoch": 0.54212, "grad_norm": 2.03125, "grad_norm_var": 0.016196441650390626, "learning_rate": 0.0001, "loss": 3.9892, "loss/crossentropy": 2.125525116920471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22518931329250336, "step": 27106 }, { "epoch": 0.54216, "grad_norm": 1.921875, "grad_norm_var": 0.012033843994140625, "learning_rate": 0.0001, "loss": 3.7171, "loss/crossentropy": 2.1541160345077515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1856529340147972, "step": 27108 }, { "epoch": 0.5422, "grad_norm": 1.90625, "grad_norm_var": 0.010927073160807292, "learning_rate": 0.0001, "loss": 4.0158, "loss/crossentropy": 1.8441925048828125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19040126353502274, "step": 27110 }, { "epoch": 0.54224, "grad_norm": 1.8984375, "grad_norm_var": 0.011260732014973959, "learning_rate": 0.0001, "loss": 3.9886, "loss/crossentropy": 1.9131816625595093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19395362585783005, "step": 27112 }, { "epoch": 0.54228, "grad_norm": 1.8359375, "grad_norm_var": 0.015254465738932292, "learning_rate": 0.0001, "loss": 4.0498, "loss/crossentropy": 1.9133941531181335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23238787055015564, "step": 27114 }, { "epoch": 0.54232, "grad_norm": 1.9453125, "grad_norm_var": 0.011253865559895833, "learning_rate": 0.0001, "loss": 4.1971, "loss/crossentropy": 2.2706873416900635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21193598210811615, "step": 27116 }, { "epoch": 0.54236, "grad_norm": 1.9765625, "grad_norm_var": 0.009822336832682292, "learning_rate": 0.0001, "loss": 3.9732, "loss/crossentropy": 1.7896053791046143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1722758710384369, "step": 27118 }, { "epoch": 0.5424, "grad_norm": 2.015625, "grad_norm_var": 0.012636057535807292, "learning_rate": 0.0001, "loss": 4.3074, "loss/crossentropy": 2.3640060424804688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21565549075603485, "step": 27120 }, { "epoch": 0.54244, "grad_norm": 1.828125, "grad_norm_var": 0.012857818603515625, "learning_rate": 0.0001, "loss": 3.8032, "loss/crossentropy": 1.6995807886123657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15929770469665527, "step": 27122 }, { "epoch": 0.54248, "grad_norm": 1.9296875, "grad_norm_var": 0.012607574462890625, "learning_rate": 0.0001, "loss": 3.9518, "loss/crossentropy": 1.6559955477714539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1741805449128151, "step": 27124 }, { "epoch": 0.54252, "grad_norm": 1.921875, "grad_norm_var": 0.012451171875, "learning_rate": 0.0001, "loss": 3.8393, "loss/crossentropy": 1.9256713390350342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23186514526605606, "step": 27126 }, { "epoch": 0.54256, "grad_norm": 1.953125, "grad_norm_var": 0.011486562093098958, "learning_rate": 0.0001, "loss": 3.8599, "loss/crossentropy": 2.0060980319976807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20487740635871887, "step": 27128 }, { "epoch": 0.5426, "grad_norm": 1.90625, "grad_norm_var": 0.009545644124348959, "learning_rate": 0.0001, "loss": 3.6376, "loss/crossentropy": 1.8419195413589478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17364958673715591, "step": 27130 }, { "epoch": 0.54264, "grad_norm": 1.9765625, "grad_norm_var": 0.0100250244140625, "learning_rate": 0.0001, "loss": 4.1161, "loss/crossentropy": 1.9890483617782593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21333064138889313, "step": 27132 }, { "epoch": 0.54268, "grad_norm": 2.015625, "grad_norm_var": 0.009639485677083334, "learning_rate": 0.0001, "loss": 4.1288, "loss/crossentropy": 2.2758660316467285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21215695142745972, "step": 27134 }, { "epoch": 0.54272, "grad_norm": 1.9375, "grad_norm_var": 0.006086985270182292, "learning_rate": 0.0001, "loss": 4.1358, "loss/crossentropy": 2.273390769958496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20087362825870514, "step": 27136 }, { "epoch": 0.54276, "grad_norm": 1.796875, "grad_norm_var": 0.005322265625, "learning_rate": 0.0001, "loss": 4.0575, "loss/crossentropy": 2.446821928024292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20775765180587769, "step": 27138 }, { "epoch": 0.5428, "grad_norm": 1.875, "grad_norm_var": 0.006449127197265625, "learning_rate": 0.0001, "loss": 4.0756, "loss/crossentropy": 2.391844630241394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2062644362449646, "step": 27140 }, { "epoch": 0.54284, "grad_norm": 1.9375, "grad_norm_var": 0.007216135660807292, "learning_rate": 0.0001, "loss": 3.9436, "loss/crossentropy": 2.112763822078705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17983781546354294, "step": 27142 }, { "epoch": 0.54288, "grad_norm": 1.921875, "grad_norm_var": 0.006624094645182292, "learning_rate": 0.0001, "loss": 4.2086, "loss/crossentropy": 2.184209704399109, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20391154289245605, "step": 27144 }, { "epoch": 0.54292, "grad_norm": 1.8671875, "grad_norm_var": 0.005806477864583334, "learning_rate": 0.0001, "loss": 3.9503, "loss/crossentropy": 1.911209523677826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19172395020723343, "step": 27146 }, { "epoch": 0.54296, "grad_norm": 1.9609375, "grad_norm_var": 0.00419921875, "learning_rate": 0.0001, "loss": 3.9281, "loss/crossentropy": 2.132554292678833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887652799487114, "step": 27148 }, { "epoch": 0.543, "grad_norm": 1.9296875, "grad_norm_var": 0.010130818684895833, "learning_rate": 0.0001, "loss": 4.1577, "loss/crossentropy": 2.4312938451766968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20902784168720245, "step": 27150 }, { "epoch": 0.54304, "grad_norm": 1.7578125, "grad_norm_var": 0.011909993489583333, "learning_rate": 0.0001, "loss": 4.088, "loss/crossentropy": 1.857543170452118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1920638307929039, "step": 27152 }, { "epoch": 0.54308, "grad_norm": 1.953125, "grad_norm_var": 0.01109619140625, "learning_rate": 0.0001, "loss": 4.035, "loss/crossentropy": 1.8170040845870972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1604015827178955, "step": 27154 }, { "epoch": 0.54312, "grad_norm": 2.015625, "grad_norm_var": 0.010815175374348958, "learning_rate": 0.0001, "loss": 3.9593, "loss/crossentropy": 1.8490120768547058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19839002192020416, "step": 27156 }, { "epoch": 0.54316, "grad_norm": 1.9140625, "grad_norm_var": 0.009954579671223958, "learning_rate": 0.0001, "loss": 3.9611, "loss/crossentropy": 2.1401009559631348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20167674124240875, "step": 27158 }, { "epoch": 0.5432, "grad_norm": 2.03125, "grad_norm_var": 0.0108306884765625, "learning_rate": 0.0001, "loss": 4.1829, "loss/crossentropy": 2.216339647769928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2046881765127182, "step": 27160 }, { "epoch": 0.54324, "grad_norm": 1.8359375, "grad_norm_var": 0.011112213134765625, "learning_rate": 0.0001, "loss": 3.9202, "loss/crossentropy": 2.2559473514556885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20772842317819595, "step": 27162 }, { "epoch": 0.54328, "grad_norm": 1.9375, "grad_norm_var": 0.010811360677083333, "learning_rate": 0.0001, "loss": 4.0261, "loss/crossentropy": 1.9810506105422974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18164421617984772, "step": 27164 }, { "epoch": 0.54332, "grad_norm": 1.9453125, "grad_norm_var": 0.0047271728515625, "learning_rate": 0.0001, "loss": 3.9554, "loss/crossentropy": 2.2585933208465576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20920690149068832, "step": 27166 }, { "epoch": 0.54336, "grad_norm": 1.75, "grad_norm_var": 0.005524698893229167, "learning_rate": 0.0001, "loss": 3.6078, "loss/crossentropy": 1.8078173995018005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16720515489578247, "step": 27168 }, { "epoch": 0.5434, "grad_norm": 1.828125, "grad_norm_var": 0.005678049723307292, "learning_rate": 0.0001, "loss": 3.9103, "loss/crossentropy": 1.7529219388961792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17757199704647064, "step": 27170 }, { "epoch": 0.54344, "grad_norm": 1.875, "grad_norm_var": 0.004713694254557292, "learning_rate": 0.0001, "loss": 3.8833, "loss/crossentropy": 2.0651434659957886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2342069447040558, "step": 27172 }, { "epoch": 0.54348, "grad_norm": 1.984375, "grad_norm_var": 0.005125935872395833, "learning_rate": 0.0001, "loss": 3.7693, "loss/crossentropy": 1.7311297059059143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17116959393024445, "step": 27174 }, { "epoch": 0.54352, "grad_norm": 1.8125, "grad_norm_var": 0.0042803446451822914, "learning_rate": 0.0001, "loss": 3.9136, "loss/crossentropy": 1.65495365858078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756131500005722, "step": 27176 }, { "epoch": 0.54356, "grad_norm": 1.8359375, "grad_norm_var": 0.0039713541666666664, "learning_rate": 0.0001, "loss": 3.8718, "loss/crossentropy": 2.1134061217308044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19070632755756378, "step": 27178 }, { "epoch": 0.5436, "grad_norm": 1.8984375, "grad_norm_var": 0.0037261962890625, "learning_rate": 0.0001, "loss": 3.8315, "loss/crossentropy": 1.8080366849899292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1905672550201416, "step": 27180 }, { "epoch": 0.54364, "grad_norm": 2.0, "grad_norm_var": 0.004550933837890625, "learning_rate": 0.0001, "loss": 4.1646, "loss/crossentropy": 2.1257325410842896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20127511024475098, "step": 27182 }, { "epoch": 0.54368, "grad_norm": 1.9375, "grad_norm_var": 0.0030263264973958335, "learning_rate": 0.0001, "loss": 4.2156, "loss/crossentropy": 2.3125778436660767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21088162064552307, "step": 27184 }, { "epoch": 0.54372, "grad_norm": 2.125, "grad_norm_var": 0.005777740478515625, "learning_rate": 0.0001, "loss": 4.14, "loss/crossentropy": 2.21799373626709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073218673467636, "step": 27186 }, { "epoch": 0.54376, "grad_norm": 1.90625, "grad_norm_var": 0.005686187744140625, "learning_rate": 0.0001, "loss": 3.8577, "loss/crossentropy": 1.8872935771942139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17742664366960526, "step": 27188 }, { "epoch": 0.5438, "grad_norm": 1.9375, "grad_norm_var": 0.0054433186848958336, "learning_rate": 0.0001, "loss": 3.7985, "loss/crossentropy": 1.6956111788749695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17246189713478088, "step": 27190 }, { "epoch": 0.54384, "grad_norm": 2.03125, "grad_norm_var": 0.0057065327962239586, "learning_rate": 0.0001, "loss": 4.0475, "loss/crossentropy": 2.0606382489204407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049379125237465, "step": 27192 }, { "epoch": 0.54388, "grad_norm": 1.8828125, "grad_norm_var": 0.005092112223307291, "learning_rate": 0.0001, "loss": 3.9601, "loss/crossentropy": 2.1031752824783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19281570613384247, "step": 27194 }, { "epoch": 0.54392, "grad_norm": 1.8125, "grad_norm_var": 0.006306711832682292, "learning_rate": 0.0001, "loss": 4.1294, "loss/crossentropy": 2.113471269607544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17625518888235092, "step": 27196 }, { "epoch": 0.54396, "grad_norm": 1.90625, "grad_norm_var": 0.0061279296875, "learning_rate": 0.0001, "loss": 4.1399, "loss/crossentropy": 2.0233633518218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19023866951465607, "step": 27198 }, { "epoch": 0.544, "grad_norm": 2.0, "grad_norm_var": 0.006453196207682292, "learning_rate": 0.0001, "loss": 4.0167, "loss/crossentropy": 2.1072750091552734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18891653418540955, "step": 27200 }, { "epoch": 0.54404, "grad_norm": 2.265625, "grad_norm_var": 0.011250559488932292, "learning_rate": 0.0001, "loss": 4.0871, "loss/crossentropy": 2.1586121320724487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19431254267692566, "step": 27202 }, { "epoch": 0.54408, "grad_norm": 1.9140625, "grad_norm_var": 0.01123046875, "learning_rate": 0.0001, "loss": 3.9215, "loss/crossentropy": 1.817845344543457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17477234452962875, "step": 27204 }, { "epoch": 0.54412, "grad_norm": 1.796875, "grad_norm_var": 0.0123687744140625, "learning_rate": 0.0001, "loss": 3.7602, "loss/crossentropy": 2.0410149693489075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19700051844120026, "step": 27206 }, { "epoch": 0.54416, "grad_norm": 1.9375, "grad_norm_var": 0.011563873291015625, "learning_rate": 0.0001, "loss": 4.1133, "loss/crossentropy": 2.096368670463562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20101771503686905, "step": 27208 }, { "epoch": 0.5442, "grad_norm": 1.96875, "grad_norm_var": 0.013061269124348959, "learning_rate": 0.0001, "loss": 4.0634, "loss/crossentropy": 2.3180510997772217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23763899505138397, "step": 27210 }, { "epoch": 0.54424, "grad_norm": 1.96875, "grad_norm_var": 0.011986287434895833, "learning_rate": 0.0001, "loss": 4.0617, "loss/crossentropy": 2.2700339555740356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21923137456178665, "step": 27212 }, { "epoch": 0.54428, "grad_norm": 1.9765625, "grad_norm_var": 0.011659495035807292, "learning_rate": 0.0001, "loss": 3.8744, "loss/crossentropy": 2.0254225730895996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1879374161362648, "step": 27214 }, { "epoch": 0.54432, "grad_norm": 1.8125, "grad_norm_var": 0.012401326497395834, "learning_rate": 0.0001, "loss": 4.0095, "loss/crossentropy": 2.0617589950561523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20201390981674194, "step": 27216 }, { "epoch": 0.54436, "grad_norm": 1.9375, "grad_norm_var": 0.005147298177083333, "learning_rate": 0.0001, "loss": 4.1374, "loss/crossentropy": 2.1432749032974243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21799273788928986, "step": 27218 }, { "epoch": 0.5444, "grad_norm": 2.015625, "grad_norm_var": 0.005622355143229166, "learning_rate": 0.0001, "loss": 4.2461, "loss/crossentropy": 1.948439359664917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1876726821064949, "step": 27220 }, { "epoch": 0.54444, "grad_norm": 2.015625, "grad_norm_var": 0.004518381754557292, "learning_rate": 0.0001, "loss": 4.0501, "loss/crossentropy": 2.2085187435150146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106829285621643, "step": 27222 }, { "epoch": 0.54448, "grad_norm": 2.0, "grad_norm_var": 0.005475870768229167, "learning_rate": 0.0001, "loss": 3.8899, "loss/crossentropy": 2.0642993450164795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19151441007852554, "step": 27224 }, { "epoch": 0.54452, "grad_norm": 2.125, "grad_norm_var": 0.009427897135416667, "learning_rate": 0.0001, "loss": 3.8693, "loss/crossentropy": 1.8351057767868042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17997722327709198, "step": 27226 }, { "epoch": 0.54456, "grad_norm": 1.8671875, "grad_norm_var": 0.009903971354166667, "learning_rate": 0.0001, "loss": 3.8896, "loss/crossentropy": 2.033097803592682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18373706936836243, "step": 27228 }, { "epoch": 0.5446, "grad_norm": 1.890625, "grad_norm_var": 0.009643300374348959, "learning_rate": 0.0001, "loss": 4.076, "loss/crossentropy": 1.9884595274925232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19420110434293747, "step": 27230 }, { "epoch": 0.54464, "grad_norm": 1.9765625, "grad_norm_var": 0.009258778889973958, "learning_rate": 0.0001, "loss": 3.7903, "loss/crossentropy": 1.9873828887939453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19645929336547852, "step": 27232 }, { "epoch": 0.54468, "grad_norm": 1.828125, "grad_norm_var": 0.010298411051432291, "learning_rate": 0.0001, "loss": 3.8454, "loss/crossentropy": 1.9592986702919006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20220571756362915, "step": 27234 }, { "epoch": 0.54472, "grad_norm": 1.890625, "grad_norm_var": 0.009480794270833334, "learning_rate": 0.0001, "loss": 3.8467, "loss/crossentropy": 2.0207183957099915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18422196060419083, "step": 27236 }, { "epoch": 0.54476, "grad_norm": 1.7578125, "grad_norm_var": 0.009740193684895834, "learning_rate": 0.0001, "loss": 3.753, "loss/crossentropy": 1.8570082187652588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17869313061237335, "step": 27238 }, { "epoch": 0.5448, "grad_norm": 1.8046875, "grad_norm_var": 0.008882649739583333, "learning_rate": 0.0001, "loss": 3.7697, "loss/crossentropy": 2.043560802936554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18643952161073685, "step": 27240 }, { "epoch": 0.54484, "grad_norm": 2.046875, "grad_norm_var": 0.0052874247233072914, "learning_rate": 0.0001, "loss": 4.0459, "loss/crossentropy": 2.2474546432495117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2210846170783043, "step": 27242 }, { "epoch": 0.54488, "grad_norm": 1.984375, "grad_norm_var": 0.005882771809895834, "learning_rate": 0.0001, "loss": 4.0702, "loss/crossentropy": 2.4019227027893066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20284153521060944, "step": 27244 }, { "epoch": 0.54492, "grad_norm": 1.8984375, "grad_norm_var": 0.006367746988932292, "learning_rate": 0.0001, "loss": 3.9849, "loss/crossentropy": 2.4402183294296265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2066793590784073, "step": 27246 }, { "epoch": 0.54496, "grad_norm": 1.7734375, "grad_norm_var": 0.006623331705729167, "learning_rate": 0.0001, "loss": 3.7691, "loss/crossentropy": 1.6989346742630005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1731812208890915, "step": 27248 }, { "epoch": 0.545, "grad_norm": 1.7890625, "grad_norm_var": 0.007126617431640625, "learning_rate": 0.0001, "loss": 3.7411, "loss/crossentropy": 1.658067524433136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17113492637872696, "step": 27250 }, { "epoch": 0.54504, "grad_norm": 1.9140625, "grad_norm_var": 0.007287343343098958, "learning_rate": 0.0001, "loss": 4.0801, "loss/crossentropy": 2.032026529312134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18906760960817337, "step": 27252 }, { "epoch": 0.54508, "grad_norm": 1.9453125, "grad_norm_var": 0.0065185546875, "learning_rate": 0.0001, "loss": 4.1494, "loss/crossentropy": 2.0466246008872986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963697373867035, "step": 27254 }, { "epoch": 0.54512, "grad_norm": 1.7890625, "grad_norm_var": 0.006906890869140625, "learning_rate": 0.0001, "loss": 4.1727, "loss/crossentropy": 2.199449300765991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19188550859689713, "step": 27256 }, { "epoch": 0.54516, "grad_norm": 1.875, "grad_norm_var": 0.007542928059895833, "learning_rate": 0.0001, "loss": 4.1578, "loss/crossentropy": 2.10872745513916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20023853331804276, "step": 27258 }, { "epoch": 0.5452, "grad_norm": 1.9453125, "grad_norm_var": 0.007010904947916666, "learning_rate": 0.0001, "loss": 4.1923, "loss/crossentropy": 2.2712149620056152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19052613526582718, "step": 27260 }, { "epoch": 0.54524, "grad_norm": 1.953125, "grad_norm_var": 0.008156077067057291, "learning_rate": 0.0001, "loss": 3.6014, "loss/crossentropy": 1.61397385597229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.155337356030941, "step": 27262 }, { "epoch": 0.54528, "grad_norm": 2.0, "grad_norm_var": 0.008194986979166667, "learning_rate": 0.0001, "loss": 4.1433, "loss/crossentropy": 1.8988584876060486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17318273335695267, "step": 27264 }, { "epoch": 0.54532, "grad_norm": 1.796875, "grad_norm_var": 0.007298787434895833, "learning_rate": 0.0001, "loss": 4.1592, "loss/crossentropy": 1.9996931552886963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18785406649112701, "step": 27266 }, { "epoch": 0.54536, "grad_norm": 1.96875, "grad_norm_var": 0.007462565104166667, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 2.1527179479599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20097015798091888, "step": 27268 }, { "epoch": 0.5454, "grad_norm": 2.03125, "grad_norm_var": 0.008609771728515625, "learning_rate": 0.0001, "loss": 4.051, "loss/crossentropy": 2.1786444187164307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20732732862234116, "step": 27270 }, { "epoch": 0.54544, "grad_norm": 1.875, "grad_norm_var": 0.007673136393229167, "learning_rate": 0.0001, "loss": 3.8147, "loss/crossentropy": 1.8255316019058228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18340704590082169, "step": 27272 }, { "epoch": 0.54548, "grad_norm": 2.0625, "grad_norm_var": 0.008046213785807292, "learning_rate": 0.0001, "loss": 4.0119, "loss/crossentropy": 2.1311771273612976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.206636942923069, "step": 27274 }, { "epoch": 0.54552, "grad_norm": 1.7265625, "grad_norm_var": 0.010113271077473958, "learning_rate": 0.0001, "loss": 3.752, "loss/crossentropy": 1.7570669651031494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15514526516199112, "step": 27276 }, { "epoch": 0.54556, "grad_norm": 1.984375, "grad_norm_var": 0.00963134765625, "learning_rate": 0.0001, "loss": 4.3659, "loss/crossentropy": 2.407896041870117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22130534052848816, "step": 27278 }, { "epoch": 0.5456, "grad_norm": 1.921875, "grad_norm_var": 0.009891764322916666, "learning_rate": 0.0001, "loss": 4.1536, "loss/crossentropy": 2.1037758588790894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23098163306713104, "step": 27280 }, { "epoch": 0.54564, "grad_norm": 1.875, "grad_norm_var": 0.008847808837890625, "learning_rate": 0.0001, "loss": 4.1011, "loss/crossentropy": 2.213571786880493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210727259516716, "step": 27282 }, { "epoch": 0.54568, "grad_norm": 1.828125, "grad_norm_var": 0.00955810546875, "learning_rate": 0.0001, "loss": 3.997, "loss/crossentropy": 1.9279372096061707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19715701043605804, "step": 27284 }, { "epoch": 0.54572, "grad_norm": 1.9296875, "grad_norm_var": 0.009372711181640625, "learning_rate": 0.0001, "loss": 4.0212, "loss/crossentropy": 2.257995128631592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844868570566177, "step": 27286 }, { "epoch": 0.54576, "grad_norm": 1.8359375, "grad_norm_var": 0.009694163004557292, "learning_rate": 0.0001, "loss": 3.8984, "loss/crossentropy": 1.7289800643920898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1757895052433014, "step": 27288 }, { "epoch": 0.5458, "grad_norm": 2.0, "grad_norm_var": 0.009930165608723958, "learning_rate": 0.0001, "loss": 3.767, "loss/crossentropy": 1.9156250953674316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17408546805381775, "step": 27290 }, { "epoch": 0.54584, "grad_norm": 1.96875, "grad_norm_var": 0.007330067952473958, "learning_rate": 0.0001, "loss": 4.153, "loss/crossentropy": 2.2218767404556274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20564305782318115, "step": 27292 }, { "epoch": 0.54588, "grad_norm": 1.8828125, "grad_norm_var": 0.007767740885416667, "learning_rate": 0.0001, "loss": 4.3436, "loss/crossentropy": 2.2322014570236206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19707325100898743, "step": 27294 }, { "epoch": 0.54592, "grad_norm": 1.8359375, "grad_norm_var": 0.007575480143229166, "learning_rate": 0.0001, "loss": 3.9286, "loss/crossentropy": 1.9559943675994873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009485438466072, "step": 27296 }, { "epoch": 0.54596, "grad_norm": 1.921875, "grad_norm_var": 0.007496897379557292, "learning_rate": 0.0001, "loss": 3.8224, "loss/crossentropy": 2.322990119457245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21545371413230896, "step": 27298 }, { "epoch": 0.546, "grad_norm": 1.8359375, "grad_norm_var": 0.007698567708333334, "learning_rate": 0.0001, "loss": 3.7273, "loss/crossentropy": 1.7752234935760498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18764031678438187, "step": 27300 }, { "epoch": 0.54604, "grad_norm": 1.8984375, "grad_norm_var": 0.008317057291666667, "learning_rate": 0.0001, "loss": 4.1671, "loss/crossentropy": 2.2797446250915527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20618843287229538, "step": 27302 }, { "epoch": 0.54608, "grad_norm": 1.953125, "grad_norm_var": 0.009468332926432291, "learning_rate": 0.0001, "loss": 4.0499, "loss/crossentropy": 2.1392215490341187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19076371937990189, "step": 27304 }, { "epoch": 0.54612, "grad_norm": 1.9296875, "grad_norm_var": 0.006852213541666667, "learning_rate": 0.0001, "loss": 3.9362, "loss/crossentropy": 2.065139055252075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20819596201181412, "step": 27306 }, { "epoch": 0.54616, "grad_norm": 1.8984375, "grad_norm_var": 0.007281239827473958, "learning_rate": 0.0001, "loss": 3.8854, "loss/crossentropy": 1.90589839220047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18194814771413803, "step": 27308 }, { "epoch": 0.5462, "grad_norm": 1.8046875, "grad_norm_var": 0.0071489969889322914, "learning_rate": 0.0001, "loss": 4.0056, "loss/crossentropy": 2.18922221660614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18913178890943527, "step": 27310 }, { "epoch": 0.54624, "grad_norm": 1.90625, "grad_norm_var": 0.006357574462890625, "learning_rate": 0.0001, "loss": 3.9098, "loss/crossentropy": 2.3256269693374634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20674752444028854, "step": 27312 }, { "epoch": 0.54628, "grad_norm": 2.015625, "grad_norm_var": 0.006859334309895834, "learning_rate": 0.0001, "loss": 3.8007, "loss/crossentropy": 2.2145326733589172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18575318902730942, "step": 27314 }, { "epoch": 0.54632, "grad_norm": 1.953125, "grad_norm_var": 0.0065419514973958336, "learning_rate": 0.0001, "loss": 4.27, "loss/crossentropy": 2.326914429664612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115549072623253, "step": 27316 }, { "epoch": 0.54636, "grad_norm": 1.8671875, "grad_norm_var": 0.006551106770833333, "learning_rate": 0.0001, "loss": 4.1064, "loss/crossentropy": 2.1162266731262207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951146125793457, "step": 27318 }, { "epoch": 0.5464, "grad_norm": 1.8671875, "grad_norm_var": 0.027858225504557292, "learning_rate": 0.0001, "loss": 3.929, "loss/crossentropy": 1.96768057346344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19022534787654877, "step": 27320 }, { "epoch": 0.54644, "grad_norm": 1.859375, "grad_norm_var": 0.028368123372395835, "learning_rate": 0.0001, "loss": 3.7685, "loss/crossentropy": 1.6591554880142212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16152790188789368, "step": 27322 }, { "epoch": 0.54648, "grad_norm": 1.734375, "grad_norm_var": 0.030863444010416668, "learning_rate": 0.0001, "loss": 3.8354, "loss/crossentropy": 2.3317655324935913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20369040220975876, "step": 27324 }, { "epoch": 0.54652, "grad_norm": 1.84375, "grad_norm_var": 0.032871246337890625, "learning_rate": 0.0001, "loss": 3.808, "loss/crossentropy": 1.9729547500610352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18059556931257248, "step": 27326 }, { "epoch": 0.54656, "grad_norm": 1.9140625, "grad_norm_var": 0.03397216796875, "learning_rate": 0.0001, "loss": 3.9546, "loss/crossentropy": 2.0512842535972595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19131098687648773, "step": 27328 }, { "epoch": 0.5466, "grad_norm": 2.125, "grad_norm_var": 0.0363433837890625, "learning_rate": 0.0001, "loss": 4.1688, "loss/crossentropy": 2.385029435157776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064640149474144, "step": 27330 }, { "epoch": 0.54664, "grad_norm": 2.0, "grad_norm_var": 0.0443603515625, "learning_rate": 0.0001, "loss": 4.2089, "loss/crossentropy": 2.069806933403015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19368857145309448, "step": 27332 }, { "epoch": 0.54668, "grad_norm": 1.9296875, "grad_norm_var": 0.043314615885416664, "learning_rate": 0.0001, "loss": 3.9046, "loss/crossentropy": 1.7913403511047363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18388576805591583, "step": 27334 }, { "epoch": 0.54672, "grad_norm": 1.890625, "grad_norm_var": 0.01974665323893229, "learning_rate": 0.0001, "loss": 3.9618, "loss/crossentropy": 1.8095470070838928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16660822927951813, "step": 27336 }, { "epoch": 0.54676, "grad_norm": 1.8828125, "grad_norm_var": 0.019551595052083332, "learning_rate": 0.0001, "loss": 3.9662, "loss/crossentropy": 1.9614168405532837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1717158928513527, "step": 27338 }, { "epoch": 0.5468, "grad_norm": 1.90625, "grad_norm_var": 0.0172607421875, "learning_rate": 0.0001, "loss": 4.1897, "loss/crossentropy": 1.9303128719329834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18384890258312225, "step": 27340 }, { "epoch": 0.54684, "grad_norm": 1.9765625, "grad_norm_var": 0.014204915364583333, "learning_rate": 0.0001, "loss": 3.8205, "loss/crossentropy": 1.675184428691864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17097727209329605, "step": 27342 }, { "epoch": 0.54688, "grad_norm": 2.078125, "grad_norm_var": 0.013331858317057292, "learning_rate": 0.0001, "loss": 3.817, "loss/crossentropy": 2.0593711137771606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22880251705646515, "step": 27344 }, { "epoch": 0.54692, "grad_norm": 1.875, "grad_norm_var": 0.011424763997395834, "learning_rate": 0.0001, "loss": 3.7013, "loss/crossentropy": 1.79487544298172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1764550730586052, "step": 27346 }, { "epoch": 0.54696, "grad_norm": 1.8046875, "grad_norm_var": 0.00430908203125, "learning_rate": 0.0001, "loss": 3.7599, "loss/crossentropy": 2.2460497617721558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20630930364131927, "step": 27348 }, { "epoch": 0.547, "grad_norm": 1.8671875, "grad_norm_var": 0.024339803059895835, "learning_rate": 0.0001, "loss": 3.9512, "loss/crossentropy": 2.056776225566864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956087201833725, "step": 27350 }, { "epoch": 0.54704, "grad_norm": 1.953125, "grad_norm_var": 0.0264556884765625, "learning_rate": 0.0001, "loss": 4.0642, "loss/crossentropy": 2.1221381425857544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19081637263298035, "step": 27352 }, { "epoch": 0.54708, "grad_norm": 1.9375, "grad_norm_var": 0.025994618733723957, "learning_rate": 0.0001, "loss": 4.2032, "loss/crossentropy": 2.1899205446243286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19173882901668549, "step": 27354 }, { "epoch": 0.54712, "grad_norm": 2.15625, "grad_norm_var": 0.029012044270833332, "learning_rate": 0.0001, "loss": 4.3297, "loss/crossentropy": 1.9936851859092712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2355274260044098, "step": 27356 }, { "epoch": 0.54716, "grad_norm": 1.9140625, "grad_norm_var": 0.02979100545247396, "learning_rate": 0.0001, "loss": 3.882, "loss/crossentropy": 1.7802082300186157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17976941913366318, "step": 27358 }, { "epoch": 0.5472, "grad_norm": 2.0, "grad_norm_var": 0.029808553059895833, "learning_rate": 0.0001, "loss": 3.9911, "loss/crossentropy": 2.1092435121536255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18953461945056915, "step": 27360 }, { "epoch": 0.54724, "grad_norm": 1.828125, "grad_norm_var": 0.031676991780598955, "learning_rate": 0.0001, "loss": 3.735, "loss/crossentropy": 1.9264041185379028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1695871204137802, "step": 27362 }, { "epoch": 0.54728, "grad_norm": 2.015625, "grad_norm_var": 0.028816731770833333, "learning_rate": 0.0001, "loss": 3.724, "loss/crossentropy": 2.0451253056526184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19317582994699478, "step": 27364 }, { "epoch": 0.54732, "grad_norm": 1.9296875, "grad_norm_var": 0.011218007405598958, "learning_rate": 0.0001, "loss": 4.0508, "loss/crossentropy": 2.084929406642914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20370616763830185, "step": 27366 }, { "epoch": 0.54736, "grad_norm": 1.9609375, "grad_norm_var": 0.008654530843098958, "learning_rate": 0.0001, "loss": 3.9552, "loss/crossentropy": 2.1887794733047485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21116343885660172, "step": 27368 }, { "epoch": 0.5474, "grad_norm": 1.875, "grad_norm_var": 0.011229451497395833, "learning_rate": 0.0001, "loss": 3.7472, "loss/crossentropy": 1.9667014479637146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17529867589473724, "step": 27370 }, { "epoch": 0.54744, "grad_norm": 2.046875, "grad_norm_var": 0.005346425374348958, "learning_rate": 0.0001, "loss": 4.1537, "loss/crossentropy": 2.351253390312195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23024218529462814, "step": 27372 }, { "epoch": 0.54748, "grad_norm": 1.9140625, "grad_norm_var": 0.006089019775390625, "learning_rate": 0.0001, "loss": 3.9757, "loss/crossentropy": 1.9090047478675842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17822134494781494, "step": 27374 }, { "epoch": 0.54752, "grad_norm": 1.9296875, "grad_norm_var": 0.005516560872395834, "learning_rate": 0.0001, "loss": 3.9657, "loss/crossentropy": 2.1193515062332153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1953936368227005, "step": 27376 }, { "epoch": 0.54756, "grad_norm": 1.875, "grad_norm_var": 0.005230458577473959, "learning_rate": 0.0001, "loss": 3.9614, "loss/crossentropy": 1.8148779273033142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18782274425029755, "step": 27378 }, { "epoch": 0.5476, "grad_norm": 2.09375, "grad_norm_var": 0.0066640218098958336, "learning_rate": 0.0001, "loss": 3.9958, "loss/crossentropy": 1.8987022042274475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1779102012515068, "step": 27380 }, { "epoch": 0.54764, "grad_norm": 1.828125, "grad_norm_var": 0.0071604410807291664, "learning_rate": 0.0001, "loss": 3.8742, "loss/crossentropy": 1.7504222989082336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.173823744058609, "step": 27382 }, { "epoch": 0.54768, "grad_norm": 1.8046875, "grad_norm_var": 0.008239491780598959, "learning_rate": 0.0001, "loss": 3.8653, "loss/crossentropy": 1.9609686732292175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18690945953130722, "step": 27384 }, { "epoch": 0.54772, "grad_norm": 1.875, "grad_norm_var": 0.006639607747395833, "learning_rate": 0.0001, "loss": 3.8654, "loss/crossentropy": 1.613632321357727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15942876040935516, "step": 27386 }, { "epoch": 0.54776, "grad_norm": 1.8203125, "grad_norm_var": 0.006379191080729167, "learning_rate": 0.0001, "loss": 3.9697, "loss/crossentropy": 2.052034914493561, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18169156461954117, "step": 27388 }, { "epoch": 0.5478, "grad_norm": 1.9375, "grad_norm_var": 0.005509185791015625, "learning_rate": 0.0001, "loss": 3.8938, "loss/crossentropy": 1.9778786301612854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21691841632127762, "step": 27390 }, { "epoch": 0.54784, "grad_norm": 2.015625, "grad_norm_var": 0.010416412353515625, "learning_rate": 0.0001, "loss": 4.2751, "loss/crossentropy": 2.272818922996521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26315005123615265, "step": 27392 }, { "epoch": 0.54788, "grad_norm": 1.9140625, "grad_norm_var": 0.010335286458333334, "learning_rate": 0.0001, "loss": 3.9688, "loss/crossentropy": 1.8213757872581482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1901681274175644, "step": 27394 }, { "epoch": 0.54792, "grad_norm": 1.9375, "grad_norm_var": 0.0095855712890625, "learning_rate": 0.0001, "loss": 4.131, "loss/crossentropy": 2.0346380472183228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18924283981323242, "step": 27396 }, { "epoch": 0.54796, "grad_norm": 1.8984375, "grad_norm_var": 0.009738922119140625, "learning_rate": 0.0001, "loss": 4.1628, "loss/crossentropy": 1.9133538603782654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124219387769699, "step": 27398 }, { "epoch": 0.548, "grad_norm": 1.90625, "grad_norm_var": 0.008540852864583334, "learning_rate": 0.0001, "loss": 4.0782, "loss/crossentropy": 2.141755998134613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20012597739696503, "step": 27400 }, { "epoch": 0.54804, "grad_norm": 1.984375, "grad_norm_var": 0.008104451497395833, "learning_rate": 0.0001, "loss": 3.9103, "loss/crossentropy": 2.031430244445801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892971396446228, "step": 27402 }, { "epoch": 0.54808, "grad_norm": 1.953125, "grad_norm_var": 0.0066220601399739586, "learning_rate": 0.0001, "loss": 4.1763, "loss/crossentropy": 2.3186575174331665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.231618270277977, "step": 27404 }, { "epoch": 0.54812, "grad_norm": 1.953125, "grad_norm_var": 0.007496897379557292, "learning_rate": 0.0001, "loss": 3.7874, "loss/crossentropy": 1.719622254371643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1646794229745865, "step": 27406 }, { "epoch": 0.54816, "grad_norm": 2.0625, "grad_norm_var": 0.008379872639973958, "learning_rate": 0.0001, "loss": 4.5104, "loss/crossentropy": 2.099879503250122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20306643843650818, "step": 27408 }, { "epoch": 0.5482, "grad_norm": 2.03125, "grad_norm_var": 0.008553059895833333, "learning_rate": 0.0001, "loss": 4.1404, "loss/crossentropy": 2.104840338230133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20727242529392242, "step": 27410 }, { "epoch": 0.54824, "grad_norm": 1.890625, "grad_norm_var": 0.009256744384765625, "learning_rate": 0.0001, "loss": 4.026, "loss/crossentropy": 2.3337541818618774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19424156099557877, "step": 27412 }, { "epoch": 0.54828, "grad_norm": 1.7578125, "grad_norm_var": 0.01055908203125, "learning_rate": 0.0001, "loss": 3.8915, "loss/crossentropy": 1.751102864742279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1715778112411499, "step": 27414 }, { "epoch": 0.54832, "grad_norm": 1.890625, "grad_norm_var": 0.0109771728515625, "learning_rate": 0.0001, "loss": 4.2044, "loss/crossentropy": 2.2091389894485474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2181248739361763, "step": 27416 }, { "epoch": 0.54836, "grad_norm": 1.984375, "grad_norm_var": 0.011226145426432292, "learning_rate": 0.0001, "loss": 4.0839, "loss/crossentropy": 2.1870399713516235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20490305125713348, "step": 27418 }, { "epoch": 0.5484, "grad_norm": 1.9765625, "grad_norm_var": 0.011344146728515626, "learning_rate": 0.0001, "loss": 4.0857, "loss/crossentropy": 2.4501689672470093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22984202951192856, "step": 27420 }, { "epoch": 0.54844, "grad_norm": 1.84375, "grad_norm_var": 0.011771392822265626, "learning_rate": 0.0001, "loss": 3.9126, "loss/crossentropy": 1.8921697735786438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1749446839094162, "step": 27422 }, { "epoch": 0.54848, "grad_norm": 1.828125, "grad_norm_var": 0.008036041259765625, "learning_rate": 0.0001, "loss": 3.8426, "loss/crossentropy": 1.6973050236701965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1689457967877388, "step": 27424 }, { "epoch": 0.54852, "grad_norm": 1.8828125, "grad_norm_var": 0.006257883707682292, "learning_rate": 0.0001, "loss": 3.9369, "loss/crossentropy": 1.7454912066459656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17953843623399734, "step": 27426 }, { "epoch": 0.54856, "grad_norm": 2.15625, "grad_norm_var": 0.0097412109375, "learning_rate": 0.0001, "loss": 4.2128, "loss/crossentropy": 2.327135920524597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150314748287201, "step": 27428 }, { "epoch": 0.5486, "grad_norm": 1.8671875, "grad_norm_var": 0.008746083577473958, "learning_rate": 0.0001, "loss": 4.1777, "loss/crossentropy": 2.273073196411133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20920293778181076, "step": 27430 }, { "epoch": 0.54864, "grad_norm": 1.9453125, "grad_norm_var": 0.0080718994140625, "learning_rate": 0.0001, "loss": 3.9236, "loss/crossentropy": 1.474639356136322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16699142009019852, "step": 27432 }, { "epoch": 0.54868, "grad_norm": 1.984375, "grad_norm_var": 0.01053466796875, "learning_rate": 0.0001, "loss": 4.1162, "loss/crossentropy": 2.116548180580139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.225176103413105, "step": 27434 }, { "epoch": 0.54872, "grad_norm": 1.953125, "grad_norm_var": 0.013060506184895833, "learning_rate": 0.0001, "loss": 3.7837, "loss/crossentropy": 2.1261587142944336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20171497762203217, "step": 27436 }, { "epoch": 0.54876, "grad_norm": 1.9296875, "grad_norm_var": 0.012074534098307292, "learning_rate": 0.0001, "loss": 4.1137, "loss/crossentropy": 2.0924129486083984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2012445107102394, "step": 27438 }, { "epoch": 0.5488, "grad_norm": 1.953125, "grad_norm_var": 0.011008453369140626, "learning_rate": 0.0001, "loss": 3.934, "loss/crossentropy": 1.7499247789382935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17569267749786377, "step": 27440 }, { "epoch": 0.54884, "grad_norm": 1.9453125, "grad_norm_var": 0.0103424072265625, "learning_rate": 0.0001, "loss": 3.7889, "loss/crossentropy": 2.0465195178985596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17263885587453842, "step": 27442 }, { "epoch": 0.54888, "grad_norm": 1.90625, "grad_norm_var": 0.0085113525390625, "learning_rate": 0.0001, "loss": 3.873, "loss/crossentropy": 1.8351019620895386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929614618420601, "step": 27444 }, { "epoch": 0.54892, "grad_norm": 1.875, "grad_norm_var": 0.008119455973307292, "learning_rate": 0.0001, "loss": 3.7952, "loss/crossentropy": 1.8877743482589722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18555018305778503, "step": 27446 }, { "epoch": 0.54896, "grad_norm": 1.921875, "grad_norm_var": 0.009757486979166667, "learning_rate": 0.0001, "loss": 3.7006, "loss/crossentropy": 1.7358683943748474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18054651468992233, "step": 27448 }, { "epoch": 0.549, "grad_norm": 1.796875, "grad_norm_var": 0.009178670247395833, "learning_rate": 0.0001, "loss": 3.9719, "loss/crossentropy": 2.0366458892822266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2056865617632866, "step": 27450 }, { "epoch": 0.54904, "grad_norm": 2.109375, "grad_norm_var": 0.0085113525390625, "learning_rate": 0.0001, "loss": 4.0996, "loss/crossentropy": 2.078625202178955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962544023990631, "step": 27452 }, { "epoch": 0.54908, "grad_norm": 1.8203125, "grad_norm_var": 0.009590657552083333, "learning_rate": 0.0001, "loss": 3.6335, "loss/crossentropy": 1.6732112765312195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1725183129310608, "step": 27454 }, { "epoch": 0.54912, "grad_norm": 1.8359375, "grad_norm_var": 0.009419759114583334, "learning_rate": 0.0001, "loss": 3.513, "loss/crossentropy": 1.8087647557258606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1810692474246025, "step": 27456 }, { "epoch": 0.54916, "grad_norm": 1.9765625, "grad_norm_var": 0.0110504150390625, "learning_rate": 0.0001, "loss": 4.2636, "loss/crossentropy": 2.087713837623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20880009979009628, "step": 27458 }, { "epoch": 0.5492, "grad_norm": 2.015625, "grad_norm_var": 0.010367838541666667, "learning_rate": 0.0001, "loss": 4.1659, "loss/crossentropy": 1.8009834289550781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19154398143291473, "step": 27460 }, { "epoch": 0.54924, "grad_norm": 1.90625, "grad_norm_var": 0.010994211832682291, "learning_rate": 0.0001, "loss": 4.1084, "loss/crossentropy": 2.272579550743103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21466071158647537, "step": 27462 }, { "epoch": 0.54928, "grad_norm": 1.9296875, "grad_norm_var": 0.010518391927083334, "learning_rate": 0.0001, "loss": 3.9836, "loss/crossentropy": 2.001379668712616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1841510310769081, "step": 27464 }, { "epoch": 0.54932, "grad_norm": 1.8828125, "grad_norm_var": 0.008973948160807292, "learning_rate": 0.0001, "loss": 4.2364, "loss/crossentropy": 2.197210907936096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20746250450611115, "step": 27466 }, { "epoch": 0.54936, "grad_norm": 2.03125, "grad_norm_var": 0.008153279622395834, "learning_rate": 0.0001, "loss": 4.0393, "loss/crossentropy": 1.6839552521705627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19475190341472626, "step": 27468 }, { "epoch": 0.5494, "grad_norm": 2.640625, "grad_norm_var": 0.03849995930989583, "learning_rate": 0.0001, "loss": 3.8291, "loss/crossentropy": 1.8280132412910461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19640739262104034, "step": 27470 }, { "epoch": 0.54944, "grad_norm": 1.96875, "grad_norm_var": 0.03960367838541667, "learning_rate": 0.0001, "loss": 3.8258, "loss/crossentropy": 2.082128942012787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20284345746040344, "step": 27472 }, { "epoch": 0.54948, "grad_norm": 1.7890625, "grad_norm_var": 0.04104181925455729, "learning_rate": 0.0001, "loss": 4.1464, "loss/crossentropy": 2.346489429473877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972089186310768, "step": 27474 }, { "epoch": 0.54952, "grad_norm": 2.0625, "grad_norm_var": 0.041751861572265625, "learning_rate": 0.0001, "loss": 4.1133, "loss/crossentropy": 2.431715250015259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2505635917186737, "step": 27476 }, { "epoch": 0.54956, "grad_norm": 2.0, "grad_norm_var": 0.03922907511393229, "learning_rate": 0.0001, "loss": 4.1596, "loss/crossentropy": 2.012192726135254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20177678763866425, "step": 27478 }, { "epoch": 0.5496, "grad_norm": 2.015625, "grad_norm_var": 0.040726725260416666, "learning_rate": 0.0001, "loss": 3.9623, "loss/crossentropy": 2.1662270426750183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1866678223013878, "step": 27480 }, { "epoch": 0.54964, "grad_norm": 1.9453125, "grad_norm_var": 0.04033381144205729, "learning_rate": 0.0001, "loss": 4.1946, "loss/crossentropy": 2.331666111946106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118167132139206, "step": 27482 }, { "epoch": 0.54968, "grad_norm": 1.8046875, "grad_norm_var": 0.042429351806640626, "learning_rate": 0.0001, "loss": 4.084, "loss/crossentropy": 2.371376156806946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20595970004796982, "step": 27484 }, { "epoch": 0.54972, "grad_norm": 1.890625, "grad_norm_var": 0.008780670166015626, "learning_rate": 0.0001, "loss": 3.9568, "loss/crossentropy": 1.8372794389724731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18964159488677979, "step": 27486 }, { "epoch": 0.54976, "grad_norm": 1.8515625, "grad_norm_var": 0.007865142822265626, "learning_rate": 0.0001, "loss": 3.9367, "loss/crossentropy": 1.9252594709396362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19232141971588135, "step": 27488 }, { "epoch": 0.5498, "grad_norm": 1.953125, "grad_norm_var": 0.00655517578125, "learning_rate": 0.0001, "loss": 3.9779, "loss/crossentropy": 1.9566543102264404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20439066737890244, "step": 27490 }, { "epoch": 0.54984, "grad_norm": 2.03125, "grad_norm_var": 0.006776682535807292, "learning_rate": 0.0001, "loss": 3.8192, "loss/crossentropy": 1.8082122802734375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1651742085814476, "step": 27492 }, { "epoch": 0.54988, "grad_norm": 1.8359375, "grad_norm_var": 0.006834920247395833, "learning_rate": 0.0001, "loss": 4.0115, "loss/crossentropy": 1.9120944738388062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.181972436606884, "step": 27494 }, { "epoch": 0.54992, "grad_norm": 1.8359375, "grad_norm_var": 0.005812327067057292, "learning_rate": 0.0001, "loss": 3.9873, "loss/crossentropy": 2.134087026119232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19440361112356186, "step": 27496 }, { "epoch": 0.54996, "grad_norm": 1.8671875, "grad_norm_var": 0.007039388020833333, "learning_rate": 0.0001, "loss": 3.6912, "loss/crossentropy": 1.8519493341445923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16762658208608627, "step": 27498 }, { "epoch": 0.55, "grad_norm": 1.8359375, "grad_norm_var": 0.0065093994140625, "learning_rate": 0.0001, "loss": 3.6277, "loss/crossentropy": 2.10863196849823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21393945068120956, "step": 27500 }, { "epoch": 0.55004, "grad_norm": 2.140625, "grad_norm_var": 0.011252593994140626, "learning_rate": 0.0001, "loss": 4.3002, "loss/crossentropy": 2.226866364479065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21370790153741837, "step": 27502 }, { "epoch": 0.55008, "grad_norm": 1.796875, "grad_norm_var": 0.011749013264973959, "learning_rate": 0.0001, "loss": 3.8337, "loss/crossentropy": 1.5916427373886108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16826944053173065, "step": 27504 }, { "epoch": 0.55012, "grad_norm": 1.9375, "grad_norm_var": 0.011677805582682292, "learning_rate": 0.0001, "loss": 4.2064, "loss/crossentropy": 2.212289810180664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20235298573970795, "step": 27506 }, { "epoch": 0.55016, "grad_norm": 1.8828125, "grad_norm_var": 0.009698232014973959, "learning_rate": 0.0001, "loss": 3.8529, "loss/crossentropy": 1.9228017926216125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19821560382843018, "step": 27508 }, { "epoch": 0.5502, "grad_norm": 1.953125, "grad_norm_var": 0.010247548421223959, "learning_rate": 0.0001, "loss": 3.9431, "loss/crossentropy": 1.8887941241264343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20294177532196045, "step": 27510 }, { "epoch": 0.55024, "grad_norm": 1.921875, "grad_norm_var": 0.010456339518229166, "learning_rate": 0.0001, "loss": 4.2666, "loss/crossentropy": 2.2699393033981323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22084251046180725, "step": 27512 }, { "epoch": 0.55028, "grad_norm": 2.03125, "grad_norm_var": 0.008101145426432291, "learning_rate": 0.0001, "loss": 4.0852, "loss/crossentropy": 1.9771258234977722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1795499324798584, "step": 27514 }, { "epoch": 0.55032, "grad_norm": 1.828125, "grad_norm_var": 0.007990519205729166, "learning_rate": 0.0001, "loss": 3.7655, "loss/crossentropy": 2.0192587971687317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19595889747142792, "step": 27516 }, { "epoch": 0.55036, "grad_norm": 1.8671875, "grad_norm_var": 5.339900461832682, "learning_rate": 0.0001, "loss": 3.9772, "loss/crossentropy": 1.9080755710601807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18093246221542358, "step": 27518 }, { "epoch": 0.5504, "grad_norm": 2.0, "grad_norm_var": 5.307067616780599, "learning_rate": 0.0001, "loss": 4.2234, "loss/crossentropy": 2.237337589263916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21287637203931808, "step": 27520 }, { "epoch": 0.55044, "grad_norm": 1.8203125, "grad_norm_var": 5.324763743082682, "learning_rate": 0.0001, "loss": 3.8267, "loss/crossentropy": 1.986120343208313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16889339685440063, "step": 27522 }, { "epoch": 0.55048, "grad_norm": 1.9140625, "grad_norm_var": 5.322076161702474, "learning_rate": 0.0001, "loss": 4.0203, "loss/crossentropy": 1.9601141810417175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20309537649154663, "step": 27524 }, { "epoch": 0.55052, "grad_norm": 1.921875, "grad_norm_var": 5.332348378499349, "learning_rate": 0.0001, "loss": 4.0248, "loss/crossentropy": 1.8901299238204956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18104882538318634, "step": 27526 }, { "epoch": 0.55056, "grad_norm": 1.890625, "grad_norm_var": 5.3486480712890625, "learning_rate": 0.0001, "loss": 4.1152, "loss/crossentropy": 2.204539656639099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20814384520053864, "step": 27528 }, { "epoch": 0.5506, "grad_norm": 1.6796875, "grad_norm_var": 5.393595377604167, "learning_rate": 0.0001, "loss": 3.6623, "loss/crossentropy": 1.8643839955329895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1773769110441208, "step": 27530 }, { "epoch": 0.55064, "grad_norm": 1.8359375, "grad_norm_var": 5.410757446289063, "learning_rate": 0.0001, "loss": 3.5404, "loss/crossentropy": 1.66867595911026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1539328545331955, "step": 27532 }, { "epoch": 0.55068, "grad_norm": 1.8203125, "grad_norm_var": 0.0146484375, "learning_rate": 0.0001, "loss": 3.87, "loss/crossentropy": 1.8830199241638184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18702629208564758, "step": 27534 }, { "epoch": 0.55072, "grad_norm": 1.8203125, "grad_norm_var": 0.007793935139973959, "learning_rate": 0.0001, "loss": 4.0369, "loss/crossentropy": 1.8840887546539307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17654889076948166, "step": 27536 }, { "epoch": 0.55076, "grad_norm": 1.9140625, "grad_norm_var": 0.008196767171223958, "learning_rate": 0.0001, "loss": 4.2076, "loss/crossentropy": 1.9453116655349731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726330816745758, "step": 27538 }, { "epoch": 0.5508, "grad_norm": 1.953125, "grad_norm_var": 0.008133951822916667, "learning_rate": 0.0001, "loss": 3.9739, "loss/crossentropy": 2.0022284388542175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18195754289627075, "step": 27540 }, { "epoch": 0.55084, "grad_norm": 1.953125, "grad_norm_var": 0.0090240478515625, "learning_rate": 0.0001, "loss": 4.2032, "loss/crossentropy": 2.164666533470154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20948568731546402, "step": 27542 }, { "epoch": 0.55088, "grad_norm": 1.9453125, "grad_norm_var": 0.00947265625, "learning_rate": 0.0001, "loss": 4.0897, "loss/crossentropy": 1.8899564146995544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1793677881360054, "step": 27544 }, { "epoch": 0.55092, "grad_norm": 1.9296875, "grad_norm_var": 0.008648427327473958, "learning_rate": 0.0001, "loss": 4.0669, "loss/crossentropy": 1.914944589138031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21741337329149246, "step": 27546 }, { "epoch": 0.55096, "grad_norm": 2.140625, "grad_norm_var": 0.0091552734375, "learning_rate": 0.0001, "loss": 3.9413, "loss/crossentropy": 1.8026766180992126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1790105178952217, "step": 27548 }, { "epoch": 0.551, "grad_norm": 2.015625, "grad_norm_var": 0.0085601806640625, "learning_rate": 0.0001, "loss": 4.2072, "loss/crossentropy": 2.1936656832695007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21415145695209503, "step": 27550 }, { "epoch": 0.55104, "grad_norm": 1.890625, "grad_norm_var": 0.0077056884765625, "learning_rate": 0.0001, "loss": 4.0292, "loss/crossentropy": 1.841277301311493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19467134028673172, "step": 27552 }, { "epoch": 0.55108, "grad_norm": 1.96875, "grad_norm_var": 0.0077056884765625, "learning_rate": 0.0001, "loss": 4.1539, "loss/crossentropy": 1.9853840470314026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18696378916502, "step": 27554 }, { "epoch": 0.55112, "grad_norm": 1.8984375, "grad_norm_var": 0.006740061442057291, "learning_rate": 0.0001, "loss": 4.0032, "loss/crossentropy": 2.213033676147461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19526077806949615, "step": 27556 }, { "epoch": 0.55116, "grad_norm": 1.9375, "grad_norm_var": 0.00665283203125, "learning_rate": 0.0001, "loss": 3.9773, "loss/crossentropy": 1.8020763397216797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1804225742816925, "step": 27558 }, { "epoch": 0.5512, "grad_norm": 1.9140625, "grad_norm_var": 0.005838775634765625, "learning_rate": 0.0001, "loss": 4.0551, "loss/crossentropy": 1.899497926235199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19439035654067993, "step": 27560 }, { "epoch": 0.55124, "grad_norm": 2.046875, "grad_norm_var": 0.00552978515625, "learning_rate": 0.0001, "loss": 3.9789, "loss/crossentropy": 1.9123604893684387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18431273847818375, "step": 27562 }, { "epoch": 0.55128, "grad_norm": 2.09375, "grad_norm_var": 0.004510243733723958, "learning_rate": 0.0001, "loss": 4.3084, "loss/crossentropy": 2.3199894428253174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21417556703090668, "step": 27564 }, { "epoch": 0.55132, "grad_norm": 1.8125, "grad_norm_var": 0.0055653889973958336, "learning_rate": 0.0001, "loss": 3.8603, "loss/crossentropy": 2.3096247911453247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19787238538265228, "step": 27566 }, { "epoch": 0.55136, "grad_norm": 1.859375, "grad_norm_var": 0.005861155192057292, "learning_rate": 0.0001, "loss": 4.083, "loss/crossentropy": 2.2241322994232178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21322394907474518, "step": 27568 }, { "epoch": 0.5514, "grad_norm": 1.9765625, "grad_norm_var": 0.005850982666015625, "learning_rate": 0.0001, "loss": 3.8552, "loss/crossentropy": 2.3283156156539917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21560409665107727, "step": 27570 }, { "epoch": 0.55144, "grad_norm": 2.0, "grad_norm_var": 0.005352528889973959, "learning_rate": 0.0001, "loss": 4.0709, "loss/crossentropy": 2.07872211933136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20099805295467377, "step": 27572 }, { "epoch": 0.55148, "grad_norm": 2.0, "grad_norm_var": 0.005273183186848958, "learning_rate": 0.0001, "loss": 4.0868, "loss/crossentropy": 2.1145899295806885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19824448227882385, "step": 27574 }, { "epoch": 0.55152, "grad_norm": 1.640625, "grad_norm_var": 0.012674967447916666, "learning_rate": 0.0001, "loss": 3.4628, "loss/crossentropy": 1.5884234309196472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1501767933368683, "step": 27576 }, { "epoch": 0.55156, "grad_norm": 1.875, "grad_norm_var": 0.0121002197265625, "learning_rate": 0.0001, "loss": 3.6905, "loss/crossentropy": 1.9045534133911133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19776541739702225, "step": 27578 }, { "epoch": 0.5516, "grad_norm": 1.796875, "grad_norm_var": 0.010337066650390626, "learning_rate": 0.0001, "loss": 3.862, "loss/crossentropy": 1.6315965056419373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15820547938346863, "step": 27580 }, { "epoch": 0.55164, "grad_norm": 2.109375, "grad_norm_var": 0.011790974934895834, "learning_rate": 0.0001, "loss": 4.298, "loss/crossentropy": 2.1012359261512756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19510812312364578, "step": 27582 }, { "epoch": 0.55168, "grad_norm": 1.9140625, "grad_norm_var": 0.012261708577473959, "learning_rate": 0.0001, "loss": 3.8833, "loss/crossentropy": 1.8365219831466675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16663482040166855, "step": 27584 }, { "epoch": 0.55172, "grad_norm": 1.9453125, "grad_norm_var": 0.012181599934895834, "learning_rate": 0.0001, "loss": 3.6868, "loss/crossentropy": 1.6274245977401733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1516888439655304, "step": 27586 }, { "epoch": 0.55176, "grad_norm": 1.921875, "grad_norm_var": 0.011529286702473959, "learning_rate": 0.0001, "loss": 3.8555, "loss/crossentropy": 1.6476022601127625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17268501967191696, "step": 27588 }, { "epoch": 0.5518, "grad_norm": 1.8984375, "grad_norm_var": 0.010400390625, "learning_rate": 0.0001, "loss": 3.7764, "loss/crossentropy": 2.135971188545227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20169296115636826, "step": 27590 }, { "epoch": 0.55184, "grad_norm": 1.8046875, "grad_norm_var": 0.006640625, "learning_rate": 0.0001, "loss": 3.7435, "loss/crossentropy": 2.2080432176589966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20463210344314575, "step": 27592 }, { "epoch": 0.55188, "grad_norm": 2.015625, "grad_norm_var": 0.008131663004557291, "learning_rate": 0.0001, "loss": 4.1494, "loss/crossentropy": 1.953204333782196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18644939363002777, "step": 27594 }, { "epoch": 0.55192, "grad_norm": 1.8671875, "grad_norm_var": 0.007429758707682292, "learning_rate": 0.0001, "loss": 4.1851, "loss/crossentropy": 1.7578163146972656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1816823109984398, "step": 27596 }, { "epoch": 0.55196, "grad_norm": 1.7578125, "grad_norm_var": 0.006193033854166667, "learning_rate": 0.0001, "loss": 3.7816, "loss/crossentropy": 1.9401060938835144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1864597350358963, "step": 27598 }, { "epoch": 0.552, "grad_norm": 2.140625, "grad_norm_var": 0.009455362955729166, "learning_rate": 0.0001, "loss": 3.8935, "loss/crossentropy": 2.03184574842453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1829465553164482, "step": 27600 }, { "epoch": 0.55204, "grad_norm": 1.7421875, "grad_norm_var": 0.010365549723307292, "learning_rate": 0.0001, "loss": 3.6649, "loss/crossentropy": 2.15334689617157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18497572094202042, "step": 27602 }, { "epoch": 0.55208, "grad_norm": 2.078125, "grad_norm_var": 0.012918853759765625, "learning_rate": 0.0001, "loss": 4.0528, "loss/crossentropy": 1.894173800945282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1757444217801094, "step": 27604 }, { "epoch": 0.55212, "grad_norm": 1.9375, "grad_norm_var": 0.0134765625, "learning_rate": 0.0001, "loss": 3.9785, "loss/crossentropy": 1.8364281058311462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17648981511592865, "step": 27606 }, { "epoch": 0.55216, "grad_norm": 1.875, "grad_norm_var": 0.012882232666015625, "learning_rate": 0.0001, "loss": 3.8651, "loss/crossentropy": 1.5523585081100464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1650981828570366, "step": 27608 }, { "epoch": 0.5522, "grad_norm": 2.0, "grad_norm_var": 0.013887532552083333, "learning_rate": 0.0001, "loss": 3.8205, "loss/crossentropy": 2.1311700344085693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19313734769821167, "step": 27610 }, { "epoch": 0.55224, "grad_norm": 1.84375, "grad_norm_var": 0.014149729410807292, "learning_rate": 0.0001, "loss": 3.8613, "loss/crossentropy": 2.23338782787323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21325203776359558, "step": 27612 }, { "epoch": 0.55228, "grad_norm": 1.75, "grad_norm_var": 0.013736724853515625, "learning_rate": 0.0001, "loss": 3.7978, "loss/crossentropy": 2.0709031224250793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17793143540620804, "step": 27614 }, { "epoch": 0.55232, "grad_norm": 1.8046875, "grad_norm_var": 0.010505167643229167, "learning_rate": 0.0001, "loss": 3.8325, "loss/crossentropy": 1.758750557899475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1758880391716957, "step": 27616 }, { "epoch": 0.55236, "grad_norm": 1.765625, "grad_norm_var": 0.010353342692057291, "learning_rate": 0.0001, "loss": 3.695, "loss/crossentropy": 1.8869126439094543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16803206503391266, "step": 27618 }, { "epoch": 0.5524, "grad_norm": 2.046875, "grad_norm_var": 0.010982004801432292, "learning_rate": 0.0001, "loss": 4.1633, "loss/crossentropy": 2.5184292793273926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22967734932899475, "step": 27620 }, { "epoch": 0.55244, "grad_norm": 3.859375, "grad_norm_var": 0.25396906534830727, "learning_rate": 0.0001, "loss": 3.9171, "loss/crossentropy": 2.2767695784568787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21888408064842224, "step": 27622 }, { "epoch": 0.55248, "grad_norm": 2.015625, "grad_norm_var": 0.2516009012858073, "learning_rate": 0.0001, "loss": 4.0942, "loss/crossentropy": 2.015432834625244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19384412467479706, "step": 27624 }, { "epoch": 0.55252, "grad_norm": 1.921875, "grad_norm_var": 0.2524879455566406, "learning_rate": 0.0001, "loss": 4.1693, "loss/crossentropy": 1.9935010075569153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20147496461868286, "step": 27626 }, { "epoch": 0.55256, "grad_norm": 2.0625, "grad_norm_var": 0.24866714477539062, "learning_rate": 0.0001, "loss": 4.1237, "loss/crossentropy": 2.0798122882843018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19859570264816284, "step": 27628 }, { "epoch": 0.5526, "grad_norm": 1.890625, "grad_norm_var": 0.24353001912434896, "learning_rate": 0.0001, "loss": 3.7006, "loss/crossentropy": 1.871802031993866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1913897469639778, "step": 27630 }, { "epoch": 0.55264, "grad_norm": 2.0625, "grad_norm_var": 0.2379351298014323, "learning_rate": 0.0001, "loss": 3.9546, "loss/crossentropy": 2.184646248817444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1939481720328331, "step": 27632 }, { "epoch": 0.55268, "grad_norm": 1.9140625, "grad_norm_var": 0.23611831665039062, "learning_rate": 0.0001, "loss": 3.8362, "loss/crossentropy": 2.0267680883407593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20360219478607178, "step": 27634 }, { "epoch": 0.55272, "grad_norm": 1.9921875, "grad_norm_var": 0.2367754618326823, "learning_rate": 0.0001, "loss": 4.2263, "loss/crossentropy": 2.2218815088272095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20947173982858658, "step": 27636 }, { "epoch": 0.55276, "grad_norm": 2.15625, "grad_norm_var": 0.010217030843098959, "learning_rate": 0.0001, "loss": 4.2844, "loss/crossentropy": 1.982936441898346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1881447434425354, "step": 27638 }, { "epoch": 0.5528, "grad_norm": 1.90625, "grad_norm_var": 0.008601888020833334, "learning_rate": 0.0001, "loss": 3.8908, "loss/crossentropy": 1.8088389039039612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1668427437543869, "step": 27640 }, { "epoch": 0.55284, "grad_norm": 1.890625, "grad_norm_var": 0.008735911051432291, "learning_rate": 0.0001, "loss": 4.214, "loss/crossentropy": 2.0886768102645874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919582262635231, "step": 27642 }, { "epoch": 0.55288, "grad_norm": 2.90625, "grad_norm_var": 0.06743748982747395, "learning_rate": 0.0001, "loss": 4.3581, "loss/crossentropy": 2.0147945880889893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20268514752388, "step": 27644 }, { "epoch": 0.55292, "grad_norm": 1.90625, "grad_norm_var": 0.06584447224934896, "learning_rate": 0.0001, "loss": 4.0147, "loss/crossentropy": 1.8306025266647339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1949317753314972, "step": 27646 }, { "epoch": 0.55296, "grad_norm": 1.8515625, "grad_norm_var": 0.06684137980143229, "learning_rate": 0.0001, "loss": 4.0944, "loss/crossentropy": 2.201099991798401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19578266888856888, "step": 27648 }, { "epoch": 0.553, "grad_norm": 1.875, "grad_norm_var": 0.06663792928059896, "learning_rate": 0.0001, "loss": 3.9902, "loss/crossentropy": 2.159493327140808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18864253163337708, "step": 27650 }, { "epoch": 0.55304, "grad_norm": 1.828125, "grad_norm_var": 0.06830215454101562, "learning_rate": 0.0001, "loss": 3.9129, "loss/crossentropy": 2.0548607110977173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18820471316576004, "step": 27652 }, { "epoch": 0.55308, "grad_norm": 1.8984375, "grad_norm_var": 0.07153701782226562, "learning_rate": 0.0001, "loss": 3.4452, "loss/crossentropy": 1.8324944972991943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19259792566299438, "step": 27654 }, { "epoch": 0.55312, "grad_norm": 2.71875, "grad_norm_var": 0.10803120930989583, "learning_rate": 0.0001, "loss": 4.2944, "loss/crossentropy": 1.8857054114341736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1885119453072548, "step": 27656 }, { "epoch": 0.55316, "grad_norm": 1.9921875, "grad_norm_var": 0.10672200520833333, "learning_rate": 0.0001, "loss": 4.1616, "loss/crossentropy": 2.0037535429000854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955013945698738, "step": 27658 }, { "epoch": 0.5532, "grad_norm": 1.8515625, "grad_norm_var": 0.05010579427083333, "learning_rate": 0.0001, "loss": 3.8527, "loss/crossentropy": 1.945756196975708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17360417544841766, "step": 27660 }, { "epoch": 0.55324, "grad_norm": 1.8125, "grad_norm_var": 0.05109456380208333, "learning_rate": 0.0001, "loss": 3.8767, "loss/crossentropy": 1.8675792217254639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17810215801000595, "step": 27662 }, { "epoch": 0.55328, "grad_norm": 1.8515625, "grad_norm_var": 0.05110041300455729, "learning_rate": 0.0001, "loss": 4.1631, "loss/crossentropy": 1.9741141200065613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17763611674308777, "step": 27664 }, { "epoch": 0.55332, "grad_norm": 2.09375, "grad_norm_var": 0.05176493326822917, "learning_rate": 0.0001, "loss": 3.9846, "loss/crossentropy": 2.123018801212311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21710921823978424, "step": 27666 }, { "epoch": 0.55336, "grad_norm": 1.8828125, "grad_norm_var": 0.05128758748372396, "learning_rate": 0.0001, "loss": 3.7929, "loss/crossentropy": 2.043752074241638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1959235817193985, "step": 27668 }, { "epoch": 0.5534, "grad_norm": 1.984375, "grad_norm_var": 0.04549560546875, "learning_rate": 0.0001, "loss": 4.0634, "loss/crossentropy": 1.9128360748291016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18556588888168335, "step": 27670 }, { "epoch": 0.55344, "grad_norm": 2.046875, "grad_norm_var": 0.008585611979166666, "learning_rate": 0.0001, "loss": 4.2678, "loss/crossentropy": 2.1376100182533264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1985430046916008, "step": 27672 }, { "epoch": 0.55348, "grad_norm": 2.03125, "grad_norm_var": 0.0088775634765625, "learning_rate": 0.0001, "loss": 4.0457, "loss/crossentropy": 2.106353759765625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20115973055362701, "step": 27674 }, { "epoch": 0.55352, "grad_norm": 2.015625, "grad_norm_var": 0.008316802978515624, "learning_rate": 0.0001, "loss": 4.0802, "loss/crossentropy": 2.5206472873687744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21297650784254074, "step": 27676 }, { "epoch": 0.55356, "grad_norm": 1.875, "grad_norm_var": 0.008229319254557292, "learning_rate": 0.0001, "loss": 3.9164, "loss/crossentropy": 1.8287059664726257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16045409440994263, "step": 27678 }, { "epoch": 0.5536, "grad_norm": 1.8828125, "grad_norm_var": 0.007610829671223959, "learning_rate": 0.0001, "loss": 3.9755, "loss/crossentropy": 2.103208303451538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20481672883033752, "step": 27680 }, { "epoch": 0.55364, "grad_norm": 1.984375, "grad_norm_var": 0.006689453125, "learning_rate": 0.0001, "loss": 4.0925, "loss/crossentropy": 1.6746403574943542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16627443581819534, "step": 27682 }, { "epoch": 0.55368, "grad_norm": 2.03125, "grad_norm_var": 0.005954742431640625, "learning_rate": 0.0001, "loss": 3.9342, "loss/crossentropy": 1.9239201545715332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19345758855342865, "step": 27684 }, { "epoch": 0.55372, "grad_norm": 1.8515625, "grad_norm_var": 0.006009674072265625, "learning_rate": 0.0001, "loss": 3.9512, "loss/crossentropy": 2.052439272403717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18695925921201706, "step": 27686 }, { "epoch": 0.55376, "grad_norm": 1.984375, "grad_norm_var": 0.005476633707682292, "learning_rate": 0.0001, "loss": 4.3674, "loss/crossentropy": 2.4474085569381714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21558378636837006, "step": 27688 }, { "epoch": 0.5538, "grad_norm": 1.9921875, "grad_norm_var": 0.006062825520833333, "learning_rate": 0.0001, "loss": 3.6432, "loss/crossentropy": 1.8521843552589417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16763968020677567, "step": 27690 }, { "epoch": 0.55384, "grad_norm": 1.875, "grad_norm_var": 0.0060455322265625, "learning_rate": 0.0001, "loss": 3.9086, "loss/crossentropy": 1.888016939163208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.185853011906147, "step": 27692 }, { "epoch": 0.55388, "grad_norm": 1.8828125, "grad_norm_var": 0.005830637613932292, "learning_rate": 0.0001, "loss": 3.8378, "loss/crossentropy": 1.7538975477218628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854776367545128, "step": 27694 }, { "epoch": 0.55392, "grad_norm": 1.9765625, "grad_norm_var": 0.005936431884765625, "learning_rate": 0.0001, "loss": 4.1631, "loss/crossentropy": 2.259602427482605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21480850875377655, "step": 27696 }, { "epoch": 0.55396, "grad_norm": 1.875, "grad_norm_var": 0.006860097249348958, "learning_rate": 0.0001, "loss": 4.0021, "loss/crossentropy": 2.0374972820281982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20546026527881622, "step": 27698 }, { "epoch": 0.554, "grad_norm": 1.9765625, "grad_norm_var": 0.006392415364583333, "learning_rate": 0.0001, "loss": 4.1568, "loss/crossentropy": 2.15248441696167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19681835174560547, "step": 27700 }, { "epoch": 0.55404, "grad_norm": 1.78125, "grad_norm_var": 0.0070798238118489586, "learning_rate": 0.0001, "loss": 3.7793, "loss/crossentropy": 1.8983227610588074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18331478536128998, "step": 27702 }, { "epoch": 0.55408, "grad_norm": 1.765625, "grad_norm_var": 0.00849609375, "learning_rate": 0.0001, "loss": 3.6701, "loss/crossentropy": 1.7995752692222595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17722319066524506, "step": 27704 }, { "epoch": 0.55412, "grad_norm": 1.921875, "grad_norm_var": 0.00760498046875, "learning_rate": 0.0001, "loss": 4.0987, "loss/crossentropy": 1.89347642660141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17802709341049194, "step": 27706 }, { "epoch": 0.55416, "grad_norm": 1.890625, "grad_norm_var": 0.008343251546223958, "learning_rate": 0.0001, "loss": 3.8648, "loss/crossentropy": 2.024496912956238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18508757650852203, "step": 27708 }, { "epoch": 0.5542, "grad_norm": 1.7265625, "grad_norm_var": 0.010731760660807292, "learning_rate": 0.0001, "loss": 3.7977, "loss/crossentropy": 2.0142337679862976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18837066739797592, "step": 27710 }, { "epoch": 0.55424, "grad_norm": 1.96875, "grad_norm_var": 0.009913889567057292, "learning_rate": 0.0001, "loss": 3.9401, "loss/crossentropy": 1.75287264585495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1797751560807228, "step": 27712 }, { "epoch": 0.55428, "grad_norm": 1.984375, "grad_norm_var": 0.007861073811848958, "learning_rate": 0.0001, "loss": 4.2331, "loss/crossentropy": 2.204139769077301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17446620762348175, "step": 27714 }, { "epoch": 0.55432, "grad_norm": 1.8203125, "grad_norm_var": 0.009317779541015625, "learning_rate": 0.0001, "loss": 4.1056, "loss/crossentropy": 2.0878941416740417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19087567925453186, "step": 27716 }, { "epoch": 0.55436, "grad_norm": 2.0625, "grad_norm_var": 0.0128570556640625, "learning_rate": 0.0001, "loss": 4.2309, "loss/crossentropy": 2.2289849519729614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21995704621076584, "step": 27718 }, { "epoch": 0.5544, "grad_norm": 1.84375, "grad_norm_var": 0.013140614827473958, "learning_rate": 0.0001, "loss": 4.1908, "loss/crossentropy": 2.2673051357269287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19668636471033096, "step": 27720 }, { "epoch": 0.55444, "grad_norm": 2.109375, "grad_norm_var": 0.0144439697265625, "learning_rate": 0.0001, "loss": 3.9146, "loss/crossentropy": 1.7722193598747253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17487196624279022, "step": 27722 }, { "epoch": 0.55448, "grad_norm": 2.03125, "grad_norm_var": 0.0147613525390625, "learning_rate": 0.0001, "loss": 4.2427, "loss/crossentropy": 1.8388479351997375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20283987373113632, "step": 27724 }, { "epoch": 0.55452, "grad_norm": 1.9765625, "grad_norm_var": 0.010334269205729166, "learning_rate": 0.0001, "loss": 4.1243, "loss/crossentropy": 2.2758948802948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20532135665416718, "step": 27726 }, { "epoch": 0.55456, "grad_norm": 1.9609375, "grad_norm_var": 0.009037017822265625, "learning_rate": 0.0001, "loss": 4.0048, "loss/crossentropy": 2.272206664085388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20994101464748383, "step": 27728 }, { "epoch": 0.5546, "grad_norm": 1.84375, "grad_norm_var": 0.011824289957682291, "learning_rate": 0.0001, "loss": 4.1122, "loss/crossentropy": 2.0169429779052734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21998245269060135, "step": 27730 }, { "epoch": 0.55464, "grad_norm": 1.9453125, "grad_norm_var": 0.011668904622395834, "learning_rate": 0.0001, "loss": 4.0199, "loss/crossentropy": 2.0700154900550842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20395687222480774, "step": 27732 }, { "epoch": 0.55468, "grad_norm": 1.8984375, "grad_norm_var": 0.010237630208333333, "learning_rate": 0.0001, "loss": 4.2295, "loss/crossentropy": 2.2826544046401978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1890336349606514, "step": 27734 }, { "epoch": 0.55472, "grad_norm": 1.765625, "grad_norm_var": 0.010503896077473958, "learning_rate": 0.0001, "loss": 3.8089, "loss/crossentropy": 2.2162158489227295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044035568833351, "step": 27736 }, { "epoch": 0.55476, "grad_norm": 2.0, "grad_norm_var": 0.009894816080729167, "learning_rate": 0.0001, "loss": 4.2302, "loss/crossentropy": 1.8466681838035583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18051490187644958, "step": 27738 }, { "epoch": 0.5548, "grad_norm": 1.8046875, "grad_norm_var": 0.010469309488932292, "learning_rate": 0.0001, "loss": 3.8307, "loss/crossentropy": 2.19679594039917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19721779227256775, "step": 27740 }, { "epoch": 0.55484, "grad_norm": 1.953125, "grad_norm_var": 0.010109202067057291, "learning_rate": 0.0001, "loss": 4.1634, "loss/crossentropy": 2.0091158151626587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19828540831804276, "step": 27742 }, { "epoch": 0.55488, "grad_norm": 1.9453125, "grad_norm_var": 0.010261027018229167, "learning_rate": 0.0001, "loss": 4.1095, "loss/crossentropy": 1.9434176087379456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20168793946504593, "step": 27744 }, { "epoch": 0.55492, "grad_norm": 1.8359375, "grad_norm_var": 0.0070302327473958336, "learning_rate": 0.0001, "loss": 3.9754, "loss/crossentropy": 2.365121006965637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20621124655008316, "step": 27746 }, { "epoch": 0.55496, "grad_norm": 2.203125, "grad_norm_var": 0.011714426676432292, "learning_rate": 0.0001, "loss": 4.1344, "loss/crossentropy": 2.4154850244522095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21647262573242188, "step": 27748 }, { "epoch": 0.555, "grad_norm": 1.9921875, "grad_norm_var": 0.01280517578125, "learning_rate": 0.0001, "loss": 3.8326, "loss/crossentropy": 1.9805672764778137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1818150281906128, "step": 27750 }, { "epoch": 0.55504, "grad_norm": 1.9609375, "grad_norm_var": 0.011188761393229166, "learning_rate": 0.0001, "loss": 4.0105, "loss/crossentropy": 2.1163841485977173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20045197010040283, "step": 27752 }, { "epoch": 0.55508, "grad_norm": 2.015625, "grad_norm_var": 0.010008748372395833, "learning_rate": 0.0001, "loss": 3.8754, "loss/crossentropy": 2.3555898666381836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19626620411872864, "step": 27754 }, { "epoch": 0.55512, "grad_norm": 1.9296875, "grad_norm_var": 0.007883453369140625, "learning_rate": 0.0001, "loss": 3.8355, "loss/crossentropy": 1.8587325811386108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18491162359714508, "step": 27756 }, { "epoch": 0.55516, "grad_norm": 1.9765625, "grad_norm_var": 0.009138743082682291, "learning_rate": 0.0001, "loss": 3.9636, "loss/crossentropy": 1.9335539937019348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18707312643527985, "step": 27758 }, { "epoch": 0.5552, "grad_norm": 1.796875, "grad_norm_var": 0.010953776041666667, "learning_rate": 0.0001, "loss": 4.0691, "loss/crossentropy": 1.9938113689422607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1832147091627121, "step": 27760 }, { "epoch": 0.55524, "grad_norm": 2.03125, "grad_norm_var": 0.010106404622395834, "learning_rate": 0.0001, "loss": 4.1229, "loss/crossentropy": 1.796451210975647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16813044995069504, "step": 27762 }, { "epoch": 0.55528, "grad_norm": 2.078125, "grad_norm_var": 0.0072672526041666664, "learning_rate": 0.0001, "loss": 3.9291, "loss/crossentropy": 2.2282769680023193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20037836581468582, "step": 27764 }, { "epoch": 0.55532, "grad_norm": 1.984375, "grad_norm_var": 0.007088216145833334, "learning_rate": 0.0001, "loss": 4.0021, "loss/crossentropy": 2.3429712057113647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21040766686201096, "step": 27766 }, { "epoch": 0.55536, "grad_norm": 2.03125, "grad_norm_var": 0.006932576497395833, "learning_rate": 0.0001, "loss": 4.0902, "loss/crossentropy": 2.24351704120636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19055639952421188, "step": 27768 }, { "epoch": 0.5554, "grad_norm": 2.0, "grad_norm_var": 0.007120768229166667, "learning_rate": 0.0001, "loss": 4.1444, "loss/crossentropy": 2.190573811531067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1991223245859146, "step": 27770 }, { "epoch": 0.55544, "grad_norm": 1.84375, "grad_norm_var": 0.008925120035807291, "learning_rate": 0.0001, "loss": 3.9567, "loss/crossentropy": 1.9847629070281982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17740985751152039, "step": 27772 }, { "epoch": 0.55548, "grad_norm": 1.828125, "grad_norm_var": 0.008565012613932292, "learning_rate": 0.0001, "loss": 3.8379, "loss/crossentropy": 2.0242174863815308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18693319708108902, "step": 27774 }, { "epoch": 0.55552, "grad_norm": 1.8828125, "grad_norm_var": 0.008141835530598959, "learning_rate": 0.0001, "loss": 3.8359, "loss/crossentropy": 1.7930986881256104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17298821359872818, "step": 27776 }, { "epoch": 0.55556, "grad_norm": 1.7578125, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 3.7851, "loss/crossentropy": 1.967544972896576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983192041516304, "step": 27778 }, { "epoch": 0.5556, "grad_norm": 2.03125, "grad_norm_var": 0.008280436197916666, "learning_rate": 0.0001, "loss": 4.2846, "loss/crossentropy": 2.353300929069519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090948149561882, "step": 27780 }, { "epoch": 0.55564, "grad_norm": 2.234375, "grad_norm_var": 0.013826243082682292, "learning_rate": 0.0001, "loss": 4.0386, "loss/crossentropy": 2.1349263191223145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19604195654392242, "step": 27782 }, { "epoch": 0.55568, "grad_norm": 1.84375, "grad_norm_var": 0.013890584309895834, "learning_rate": 0.0001, "loss": 3.9092, "loss/crossentropy": 2.1047816276550293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20156846940517426, "step": 27784 }, { "epoch": 0.55572, "grad_norm": 1.7890625, "grad_norm_var": 0.013213857014973959, "learning_rate": 0.0001, "loss": 3.8338, "loss/crossentropy": 1.9704868793487549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18116163462400436, "step": 27786 }, { "epoch": 0.55576, "grad_norm": 2.015625, "grad_norm_var": 0.013628896077473958, "learning_rate": 0.0001, "loss": 3.9997, "loss/crossentropy": 1.906118392944336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18277255445718765, "step": 27788 }, { "epoch": 0.5558, "grad_norm": 1.9140625, "grad_norm_var": 0.013449859619140626, "learning_rate": 0.0001, "loss": 4.2688, "loss/crossentropy": 2.0571773648262024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1843668520450592, "step": 27790 }, { "epoch": 0.55584, "grad_norm": 1.8046875, "grad_norm_var": 0.01390380859375, "learning_rate": 0.0001, "loss": 3.8647, "loss/crossentropy": 1.9324238300323486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16451232880353928, "step": 27792 }, { "epoch": 0.55588, "grad_norm": 1.8203125, "grad_norm_var": 0.012589263916015624, "learning_rate": 0.0001, "loss": 4.1593, "loss/crossentropy": 2.27968692779541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21742059290409088, "step": 27794 }, { "epoch": 0.55592, "grad_norm": 1.9765625, "grad_norm_var": 0.012804921468098958, "learning_rate": 0.0001, "loss": 4.3922, "loss/crossentropy": 2.20041286945343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22774580121040344, "step": 27796 }, { "epoch": 0.55596, "grad_norm": 1.859375, "grad_norm_var": 0.006239573160807292, "learning_rate": 0.0001, "loss": 3.9747, "loss/crossentropy": 2.258635997772217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18686214089393616, "step": 27798 }, { "epoch": 0.556, "grad_norm": 1.875, "grad_norm_var": 0.006217193603515625, "learning_rate": 0.0001, "loss": 3.8514, "loss/crossentropy": 1.8728525638580322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1724954918026924, "step": 27800 }, { "epoch": 0.55604, "grad_norm": 2.09375, "grad_norm_var": 0.009789021809895833, "learning_rate": 0.0001, "loss": 4.3107, "loss/crossentropy": 1.7146453261375427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18469443172216415, "step": 27802 }, { "epoch": 0.55608, "grad_norm": 1.90625, "grad_norm_var": 0.009015909830729167, "learning_rate": 0.0001, "loss": 3.9967, "loss/crossentropy": 2.194948673248291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963738277554512, "step": 27804 }, { "epoch": 0.55612, "grad_norm": 1.8046875, "grad_norm_var": 0.0107177734375, "learning_rate": 0.0001, "loss": 3.7152, "loss/crossentropy": 1.7860903143882751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16485057771205902, "step": 27806 }, { "epoch": 0.55616, "grad_norm": 2.0625, "grad_norm_var": 0.011246490478515624, "learning_rate": 0.0001, "loss": 4.0281, "loss/crossentropy": 2.1505147218704224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2449631690979004, "step": 27808 }, { "epoch": 0.5562, "grad_norm": 2.015625, "grad_norm_var": 0.011139933268229167, "learning_rate": 0.0001, "loss": 4.1753, "loss/crossentropy": 2.2144479751586914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19644592702388763, "step": 27810 }, { "epoch": 0.55624, "grad_norm": 1.9921875, "grad_norm_var": 0.010796864827473959, "learning_rate": 0.0001, "loss": 4.1869, "loss/crossentropy": 2.2427414655685425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20603332668542862, "step": 27812 }, { "epoch": 0.55628, "grad_norm": 1.6875, "grad_norm_var": 0.013533528645833333, "learning_rate": 0.0001, "loss": 3.8191, "loss/crossentropy": 1.9676510691642761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16527333855628967, "step": 27814 }, { "epoch": 0.55632, "grad_norm": 1.84375, "grad_norm_var": 0.014025624593098958, "learning_rate": 0.0001, "loss": 3.823, "loss/crossentropy": 1.9767807722091675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1865728572010994, "step": 27816 }, { "epoch": 0.55636, "grad_norm": 1.9140625, "grad_norm_var": 0.008782704671223959, "learning_rate": 0.0001, "loss": 3.7126, "loss/crossentropy": 1.9406518936157227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16727876663208008, "step": 27818 }, { "epoch": 0.5564, "grad_norm": 2.046875, "grad_norm_var": 0.010550689697265626, "learning_rate": 0.0001, "loss": 3.9657, "loss/crossentropy": 2.0927204489707947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055317759513855, "step": 27820 }, { "epoch": 0.55644, "grad_norm": 2.015625, "grad_norm_var": 0.0100006103515625, "learning_rate": 0.0001, "loss": 3.825, "loss/crossentropy": 1.9616308212280273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1885625645518303, "step": 27822 }, { "epoch": 0.55648, "grad_norm": 1.828125, "grad_norm_var": 0.008890533447265625, "learning_rate": 0.0001, "loss": 3.7727, "loss/crossentropy": 1.7788268327713013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1750224232673645, "step": 27824 }, { "epoch": 0.55652, "grad_norm": 1.96875, "grad_norm_var": 0.009391021728515626, "learning_rate": 0.0001, "loss": 3.8066, "loss/crossentropy": 2.006068170070648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18516544997692108, "step": 27826 }, { "epoch": 0.55656, "grad_norm": 1.9140625, "grad_norm_var": 0.012995402018229166, "learning_rate": 0.0001, "loss": 3.8937, "loss/crossentropy": 2.046180784702301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17838794738054276, "step": 27828 }, { "epoch": 0.5566, "grad_norm": 1.984375, "grad_norm_var": 0.010526275634765625, "learning_rate": 0.0001, "loss": 3.8813, "loss/crossentropy": 1.9033851027488708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18221035599708557, "step": 27830 }, { "epoch": 0.55664, "grad_norm": 1.84375, "grad_norm_var": 0.009415435791015624, "learning_rate": 0.0001, "loss": 4.128, "loss/crossentropy": 2.1586283445358276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20454445481300354, "step": 27832 }, { "epoch": 0.55668, "grad_norm": 1.890625, "grad_norm_var": 0.009645334879557292, "learning_rate": 0.0001, "loss": 4.0045, "loss/crossentropy": 2.383496642112732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19787979125976562, "step": 27834 }, { "epoch": 0.55672, "grad_norm": 1.8828125, "grad_norm_var": 0.009150950113932292, "learning_rate": 0.0001, "loss": 3.9134, "loss/crossentropy": 2.052733063697815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916261911392212, "step": 27836 }, { "epoch": 0.55676, "grad_norm": 2.046875, "grad_norm_var": 0.009281412760416666, "learning_rate": 0.0001, "loss": 4.2263, "loss/crossentropy": 2.1427736282348633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1837182566523552, "step": 27838 }, { "epoch": 0.5568, "grad_norm": 1.8046875, "grad_norm_var": 0.009651438395182291, "learning_rate": 0.0001, "loss": 3.8118, "loss/crossentropy": 2.186544895172119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20473113656044006, "step": 27840 }, { "epoch": 0.55684, "grad_norm": 1.9453125, "grad_norm_var": 0.007845052083333333, "learning_rate": 0.0001, "loss": 3.9411, "loss/crossentropy": 2.012458860874176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20121898502111435, "step": 27842 }, { "epoch": 0.55688, "grad_norm": 2.0, "grad_norm_var": 0.00859375, "learning_rate": 0.0001, "loss": 3.8415, "loss/crossentropy": 1.9217159748077393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1689399629831314, "step": 27844 }, { "epoch": 0.55692, "grad_norm": 1.9453125, "grad_norm_var": 0.007258097330729167, "learning_rate": 0.0001, "loss": 3.9051, "loss/crossentropy": 2.1690521240234375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21002651005983353, "step": 27846 }, { "epoch": 0.55696, "grad_norm": 1.84375, "grad_norm_var": 0.006992340087890625, "learning_rate": 0.0001, "loss": 3.9794, "loss/crossentropy": 2.3649327754974365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20836569368839264, "step": 27848 }, { "epoch": 0.557, "grad_norm": 2.390625, "grad_norm_var": 0.021735636393229167, "learning_rate": 0.0001, "loss": 4.1079, "loss/crossentropy": 2.118848145008087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2462398260831833, "step": 27850 }, { "epoch": 0.55704, "grad_norm": 1.8671875, "grad_norm_var": 0.022043609619140626, "learning_rate": 0.0001, "loss": 4.1345, "loss/crossentropy": 1.9461410641670227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836184561252594, "step": 27852 }, { "epoch": 0.55708, "grad_norm": 1.8671875, "grad_norm_var": 0.021817779541015624, "learning_rate": 0.0001, "loss": 4.0911, "loss/crossentropy": 2.225842833518982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19738706201314926, "step": 27854 }, { "epoch": 0.55712, "grad_norm": 1.9765625, "grad_norm_var": 0.020702870686848958, "learning_rate": 0.0001, "loss": 4.2437, "loss/crossentropy": 2.379333972930908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21520506590604782, "step": 27856 }, { "epoch": 0.55716, "grad_norm": 1.984375, "grad_norm_var": 0.020957183837890626, "learning_rate": 0.0001, "loss": 3.94, "loss/crossentropy": 1.789103090763092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16888972371816635, "step": 27858 }, { "epoch": 0.5572, "grad_norm": 2.078125, "grad_norm_var": 0.017464192708333333, "learning_rate": 0.0001, "loss": 4.1865, "loss/crossentropy": 2.1485098600387573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049245834350586, "step": 27860 }, { "epoch": 0.55724, "grad_norm": 1.9140625, "grad_norm_var": 0.017185211181640625, "learning_rate": 0.0001, "loss": 4.1494, "loss/crossentropy": 1.8173152804374695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17991861701011658, "step": 27862 }, { "epoch": 0.55728, "grad_norm": 1.9765625, "grad_norm_var": 0.016206868489583335, "learning_rate": 0.0001, "loss": 4.0881, "loss/crossentropy": 2.1297428011894226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19571492820978165, "step": 27864 }, { "epoch": 0.55732, "grad_norm": 2.109375, "grad_norm_var": 0.007212066650390625, "learning_rate": 0.0001, "loss": 3.9824, "loss/crossentropy": 2.038254678249359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918657273054123, "step": 27866 }, { "epoch": 0.55736, "grad_norm": 2.015625, "grad_norm_var": 0.006790924072265625, "learning_rate": 0.0001, "loss": 4.2425, "loss/crossentropy": 2.1835550665855408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19663546979427338, "step": 27868 }, { "epoch": 0.5574, "grad_norm": 1.96875, "grad_norm_var": 0.006021881103515625, "learning_rate": 0.0001, "loss": 4.097, "loss/crossentropy": 2.0005786418914795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20361479371786118, "step": 27870 }, { "epoch": 0.55744, "grad_norm": 1.8828125, "grad_norm_var": 0.006640625, "learning_rate": 0.0001, "loss": 4.036, "loss/crossentropy": 1.9897403717041016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18811679631471634, "step": 27872 }, { "epoch": 0.55748, "grad_norm": 2.625, "grad_norm_var": 0.035042063395182295, "learning_rate": 0.0001, "loss": 3.8279, "loss/crossentropy": 1.8575212359428406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18323712795972824, "step": 27874 }, { "epoch": 0.55752, "grad_norm": 2.296875, "grad_norm_var": 0.040537261962890626, "learning_rate": 0.0001, "loss": 3.8482, "loss/crossentropy": 2.1971237659454346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20962104946374893, "step": 27876 }, { "epoch": 0.55756, "grad_norm": 1.9609375, "grad_norm_var": 0.04003473917643229, "learning_rate": 0.0001, "loss": 4.0675, "loss/crossentropy": 2.1657967567443848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19974523782730103, "step": 27878 }, { "epoch": 0.5576, "grad_norm": 2.0625, "grad_norm_var": 0.04272028605143229, "learning_rate": 0.0001, "loss": 3.9154, "loss/crossentropy": 1.988546371459961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2359297126531601, "step": 27880 }, { "epoch": 0.55764, "grad_norm": 2.0, "grad_norm_var": 0.0413330078125, "learning_rate": 0.0001, "loss": 4.3125, "loss/crossentropy": 1.979064404964447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1763293370604515, "step": 27882 }, { "epoch": 0.55768, "grad_norm": 2.0, "grad_norm_var": 0.04175516764322917, "learning_rate": 0.0001, "loss": 4.307, "loss/crossentropy": 2.3738937377929688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21125850826501846, "step": 27884 }, { "epoch": 0.55772, "grad_norm": 1.8671875, "grad_norm_var": 0.04346110026041667, "learning_rate": 0.0001, "loss": 4.0331, "loss/crossentropy": 1.9243283867835999, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22126196324825287, "step": 27886 }, { "epoch": 0.55776, "grad_norm": 1.953125, "grad_norm_var": 0.0432373046875, "learning_rate": 0.0001, "loss": 3.9017, "loss/crossentropy": 2.0033947229385376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17997342348098755, "step": 27888 }, { "epoch": 0.5578, "grad_norm": 1.9375, "grad_norm_var": 0.015396881103515624, "learning_rate": 0.0001, "loss": 4.0113, "loss/crossentropy": 2.1850863695144653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20201555639505386, "step": 27890 }, { "epoch": 0.55784, "grad_norm": 1.796875, "grad_norm_var": 0.008487955729166666, "learning_rate": 0.0001, "loss": 3.8585, "loss/crossentropy": 1.9696857333183289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20272113382816315, "step": 27892 }, { "epoch": 0.55788, "grad_norm": 1.921875, "grad_norm_var": 0.009234364827473958, "learning_rate": 0.0001, "loss": 4.2124, "loss/crossentropy": 2.014931857585907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2670851871371269, "step": 27894 }, { "epoch": 0.55792, "grad_norm": 2.8125, "grad_norm_var": 0.05494359334309896, "learning_rate": 0.0001, "loss": 3.91, "loss/crossentropy": 1.7789946794509888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16623269766569138, "step": 27896 }, { "epoch": 0.55796, "grad_norm": 1.7734375, "grad_norm_var": 0.05752766927083333, "learning_rate": 0.0001, "loss": 3.9915, "loss/crossentropy": 1.9287369847297668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17834855616092682, "step": 27898 }, { "epoch": 0.558, "grad_norm": 1.921875, "grad_norm_var": 0.0579254150390625, "learning_rate": 0.0001, "loss": 4.0939, "loss/crossentropy": 2.2972252368927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19952362030744553, "step": 27900 }, { "epoch": 0.55804, "grad_norm": 2.015625, "grad_norm_var": 0.05802993774414063, "learning_rate": 0.0001, "loss": 4.0678, "loss/crossentropy": 2.307905077934265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21856702864170074, "step": 27902 }, { "epoch": 0.55808, "grad_norm": 2.203125, "grad_norm_var": 0.06280008951822917, "learning_rate": 0.0001, "loss": 3.9421, "loss/crossentropy": 2.0781975984573364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20248684287071228, "step": 27904 }, { "epoch": 0.55812, "grad_norm": 2.125, "grad_norm_var": 0.06393229166666667, "learning_rate": 0.0001, "loss": 4.0949, "loss/crossentropy": 2.2437129616737366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20828180015087128, "step": 27906 }, { "epoch": 0.55816, "grad_norm": 1.8671875, "grad_norm_var": 0.06414286295572917, "learning_rate": 0.0001, "loss": 3.6518, "loss/crossentropy": 1.8736275434494019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1924392208456993, "step": 27908 }, { "epoch": 0.5582, "grad_norm": 1.9296875, "grad_norm_var": 0.06412734985351562, "learning_rate": 0.0001, "loss": 3.874, "loss/crossentropy": 1.7928629517555237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1721597984433174, "step": 27910 }, { "epoch": 0.55824, "grad_norm": 1.8125, "grad_norm_var": 0.0142486572265625, "learning_rate": 0.0001, "loss": 3.9806, "loss/crossentropy": 1.8429313898086548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18428131937980652, "step": 27912 }, { "epoch": 0.55828, "grad_norm": 2.015625, "grad_norm_var": 0.015417226155598958, "learning_rate": 0.0001, "loss": 3.8051, "loss/crossentropy": 1.7473148107528687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1703418493270874, "step": 27914 }, { "epoch": 0.55832, "grad_norm": 2.0, "grad_norm_var": 0.015712229410807292, "learning_rate": 0.0001, "loss": 4.1174, "loss/crossentropy": 1.9749281406402588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19364860653877258, "step": 27916 }, { "epoch": 0.55836, "grad_norm": 1.84375, "grad_norm_var": 0.014564768473307291, "learning_rate": 0.0001, "loss": 3.9925, "loss/crossentropy": 2.0153204202651978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18824293464422226, "step": 27918 }, { "epoch": 0.5584, "grad_norm": 1.8984375, "grad_norm_var": 0.008925120035807291, "learning_rate": 0.0001, "loss": 3.9048, "loss/crossentropy": 1.984444797039032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17791762948036194, "step": 27920 }, { "epoch": 0.55844, "grad_norm": 2.09375, "grad_norm_var": 0.007846832275390625, "learning_rate": 0.0001, "loss": 4.0684, "loss/crossentropy": 2.1440568566322327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.277907058596611, "step": 27922 }, { "epoch": 0.55848, "grad_norm": 1.9140625, "grad_norm_var": 0.01706517537434896, "learning_rate": 0.0001, "loss": 4.1565, "loss/crossentropy": 2.261883854866028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21542102098464966, "step": 27924 }, { "epoch": 0.55852, "grad_norm": 1.9609375, "grad_norm_var": 0.017647298177083333, "learning_rate": 0.0001, "loss": 4.1513, "loss/crossentropy": 2.0659791827201843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17906638234853745, "step": 27926 }, { "epoch": 0.55856, "grad_norm": 1.921875, "grad_norm_var": 0.016893513997395835, "learning_rate": 0.0001, "loss": 4.1226, "loss/crossentropy": 2.536762237548828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20740076154470444, "step": 27928 }, { "epoch": 0.5586, "grad_norm": 1.9765625, "grad_norm_var": 0.014314778645833333, "learning_rate": 0.0001, "loss": 3.741, "loss/crossentropy": 1.8174605965614319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17254772782325745, "step": 27930 }, { "epoch": 0.55864, "grad_norm": 2.015625, "grad_norm_var": 0.015575154622395834, "learning_rate": 0.0001, "loss": 3.9703, "loss/crossentropy": 2.2474478483200073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22224698960781097, "step": 27932 }, { "epoch": 0.55868, "grad_norm": 2.125, "grad_norm_var": 0.016844685872395834, "learning_rate": 0.0001, "loss": 4.0159, "loss/crossentropy": 1.9788220524787903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19506333768367767, "step": 27934 }, { "epoch": 0.55872, "grad_norm": 1.8984375, "grad_norm_var": 0.016670735677083333, "learning_rate": 0.0001, "loss": 4.0164, "loss/crossentropy": 1.9576459527015686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17929108440876007, "step": 27936 }, { "epoch": 0.55876, "grad_norm": 2.09375, "grad_norm_var": 0.015987141927083334, "learning_rate": 0.0001, "loss": 4.2099, "loss/crossentropy": 2.2682799100875854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22949229925870895, "step": 27938 }, { "epoch": 0.5588, "grad_norm": 1.9453125, "grad_norm_var": 0.009235636393229166, "learning_rate": 0.0001, "loss": 3.9841, "loss/crossentropy": 2.0454965829849243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19698018580675125, "step": 27940 }, { "epoch": 0.55884, "grad_norm": 1.7734375, "grad_norm_var": 0.010729726155598958, "learning_rate": 0.0001, "loss": 4.0611, "loss/crossentropy": 2.3304296731948853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22486522793769836, "step": 27942 }, { "epoch": 0.55888, "grad_norm": 1.8984375, "grad_norm_var": 0.010538736979166666, "learning_rate": 0.0001, "loss": 3.787, "loss/crossentropy": 1.7931689620018005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16503961384296417, "step": 27944 }, { "epoch": 0.55892, "grad_norm": 1.9609375, "grad_norm_var": 0.011124674479166667, "learning_rate": 0.0001, "loss": 3.7788, "loss/crossentropy": 1.8899441957473755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18204578012228012, "step": 27946 }, { "epoch": 0.55896, "grad_norm": 2.0625, "grad_norm_var": 0.0106109619140625, "learning_rate": 0.0001, "loss": 3.9515, "loss/crossentropy": 1.9887003302574158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19222819805145264, "step": 27948 }, { "epoch": 0.559, "grad_norm": 1.875, "grad_norm_var": 0.0088531494140625, "learning_rate": 0.0001, "loss": 3.9696, "loss/crossentropy": 2.097675323486328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877489537000656, "step": 27950 }, { "epoch": 0.55904, "grad_norm": 1.8359375, "grad_norm_var": 0.009248860677083333, "learning_rate": 0.0001, "loss": 3.9556, "loss/crossentropy": 1.801637887954712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15989546477794647, "step": 27952 }, { "epoch": 0.55908, "grad_norm": 1.8984375, "grad_norm_var": 0.008756256103515625, "learning_rate": 0.0001, "loss": 4.141, "loss/crossentropy": 2.1653464436531067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17995328456163406, "step": 27954 }, { "epoch": 0.55912, "grad_norm": 1.796875, "grad_norm_var": 0.009506988525390624, "learning_rate": 0.0001, "loss": 3.8267, "loss/crossentropy": 2.370779275894165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22155411541461945, "step": 27956 }, { "epoch": 0.55916, "grad_norm": 1.875, "grad_norm_var": 0.008055623372395833, "learning_rate": 0.0001, "loss": 3.9821, "loss/crossentropy": 1.9569392204284668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20606116950511932, "step": 27958 }, { "epoch": 0.5592, "grad_norm": 2.046875, "grad_norm_var": 0.009919230143229167, "learning_rate": 0.0001, "loss": 3.9857, "loss/crossentropy": 1.8667678833007812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18434254080057144, "step": 27960 }, { "epoch": 0.55924, "grad_norm": 1.9140625, "grad_norm_var": 0.009171549479166667, "learning_rate": 0.0001, "loss": 4.0802, "loss/crossentropy": 2.4322283267974854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21796736866235733, "step": 27962 }, { "epoch": 0.55928, "grad_norm": 1.921875, "grad_norm_var": 0.0070220947265625, "learning_rate": 0.0001, "loss": 3.8252, "loss/crossentropy": 1.7290849685668945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17558299005031586, "step": 27964 }, { "epoch": 0.55932, "grad_norm": 1.9140625, "grad_norm_var": 0.006430816650390625, "learning_rate": 0.0001, "loss": 3.8143, "loss/crossentropy": 1.9967953562736511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19310998916625977, "step": 27966 }, { "epoch": 0.55936, "grad_norm": 1.953125, "grad_norm_var": 0.006685129801432292, "learning_rate": 0.0001, "loss": 3.9863, "loss/crossentropy": 2.027643382549286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1989550217986107, "step": 27968 }, { "epoch": 0.5594, "grad_norm": 1.7890625, "grad_norm_var": 0.016312408447265624, "learning_rate": 0.0001, "loss": 3.8968, "loss/crossentropy": 2.057853937149048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117058396339417, "step": 27970 }, { "epoch": 0.55944, "grad_norm": 1.9140625, "grad_norm_var": 0.015122222900390624, "learning_rate": 0.0001, "loss": 4.1212, "loss/crossentropy": 2.113202214241028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19248170405626297, "step": 27972 }, { "epoch": 0.55948, "grad_norm": 2.015625, "grad_norm_var": 0.014454905192057292, "learning_rate": 0.0001, "loss": 4.0971, "loss/crossentropy": 2.04659241437912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064732015132904, "step": 27974 }, { "epoch": 0.55952, "grad_norm": 1.8828125, "grad_norm_var": 0.012943522135416666, "learning_rate": 0.0001, "loss": 4.0117, "loss/crossentropy": 2.046691119670868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19065889716148376, "step": 27976 }, { "epoch": 0.55956, "grad_norm": 1.8671875, "grad_norm_var": 0.014945475260416667, "learning_rate": 0.0001, "loss": 3.7466, "loss/crossentropy": 1.896644115447998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838160902261734, "step": 27978 }, { "epoch": 0.5596, "grad_norm": 2.0, "grad_norm_var": 0.015411122639973959, "learning_rate": 0.0001, "loss": 3.9367, "loss/crossentropy": 1.9928682446479797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17411044239997864, "step": 27980 }, { "epoch": 0.55964, "grad_norm": 1.875, "grad_norm_var": 0.015843709309895832, "learning_rate": 0.0001, "loss": 3.9616, "loss/crossentropy": 1.9449793100357056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18892540037631989, "step": 27982 }, { "epoch": 0.55968, "grad_norm": 2.03125, "grad_norm_var": 0.0159912109375, "learning_rate": 0.0001, "loss": 3.8687, "loss/crossentropy": 1.949626863002777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1996856927871704, "step": 27984 }, { "epoch": 0.55972, "grad_norm": 1.828125, "grad_norm_var": 0.0056793212890625, "learning_rate": 0.0001, "loss": 4.0761, "loss/crossentropy": 2.116423010826111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18394631892442703, "step": 27986 }, { "epoch": 0.55976, "grad_norm": 1.8515625, "grad_norm_var": 0.006320953369140625, "learning_rate": 0.0001, "loss": 3.9168, "loss/crossentropy": 2.151313066482544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1895233690738678, "step": 27988 }, { "epoch": 0.5598, "grad_norm": 2.03125, "grad_norm_var": 0.007739003499348958, "learning_rate": 0.0001, "loss": 3.8336, "loss/crossentropy": 2.1060383319854736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21665210276842117, "step": 27990 }, { "epoch": 0.55984, "grad_norm": 2.0625, "grad_norm_var": 0.009528605143229167, "learning_rate": 0.0001, "loss": 3.9715, "loss/crossentropy": 2.201907217502594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20985589921474457, "step": 27992 }, { "epoch": 0.55988, "grad_norm": 2.0, "grad_norm_var": 0.0083251953125, "learning_rate": 0.0001, "loss": 3.9765, "loss/crossentropy": 2.0219600796699524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2357076108455658, "step": 27994 }, { "epoch": 0.55992, "grad_norm": 1.984375, "grad_norm_var": 0.008166249593098958, "learning_rate": 0.0001, "loss": 4.1518, "loss/crossentropy": 2.006410300731659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20221157371997833, "step": 27996 }, { "epoch": 0.55996, "grad_norm": 2.0625, "grad_norm_var": 0.009886678059895833, "learning_rate": 0.0001, "loss": 3.842, "loss/crossentropy": 1.7009567618370056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16861556470394135, "step": 27998 }, { "epoch": 0.56, "grad_norm": 1.90625, "grad_norm_var": 0.010587310791015625, "learning_rate": 0.0001, "loss": 3.6914, "loss/crossentropy": 1.7348475456237793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15609163790941238, "step": 28000 }, { "epoch": 0.56004, "grad_norm": 2.0625, "grad_norm_var": 0.011087799072265625, "learning_rate": 0.0001, "loss": 4.0131, "loss/crossentropy": 2.253269076347351, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21236062049865723, "step": 28002 }, { "epoch": 0.56008, "grad_norm": 1.890625, "grad_norm_var": 0.01077880859375, "learning_rate": 0.0001, "loss": 3.6868, "loss/crossentropy": 1.5210962891578674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15905095636844635, "step": 28004 }, { "epoch": 0.56012, "grad_norm": 1.828125, "grad_norm_var": 0.011017862955729167, "learning_rate": 0.0001, "loss": 4.201, "loss/crossentropy": 2.201940894126892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.206693597137928, "step": 28006 }, { "epoch": 0.56016, "grad_norm": 2.0, "grad_norm_var": 0.010367584228515626, "learning_rate": 0.0001, "loss": 3.979, "loss/crossentropy": 2.0980228185653687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19499729573726654, "step": 28008 }, { "epoch": 0.5602, "grad_norm": 1.96875, "grad_norm_var": 0.011413319905598959, "learning_rate": 0.0001, "loss": 4.0716, "loss/crossentropy": 2.0373584032058716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17089952528476715, "step": 28010 }, { "epoch": 0.56024, "grad_norm": 2.09375, "grad_norm_var": 0.013291168212890624, "learning_rate": 0.0001, "loss": 4.3161, "loss/crossentropy": 2.4253649711608887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2154974490404129, "step": 28012 }, { "epoch": 0.56028, "grad_norm": 1.984375, "grad_norm_var": 0.011405181884765626, "learning_rate": 0.0001, "loss": 3.9914, "loss/crossentropy": 2.121450662612915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19490933418273926, "step": 28014 }, { "epoch": 0.56032, "grad_norm": 1.859375, "grad_norm_var": 0.013916015625, "learning_rate": 0.0001, "loss": 3.9158, "loss/crossentropy": 1.9926584959030151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19274179637432098, "step": 28016 }, { "epoch": 0.56036, "grad_norm": 2.09375, "grad_norm_var": 0.015559641520182292, "learning_rate": 0.0001, "loss": 3.9918, "loss/crossentropy": 1.6820538640022278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17553947865962982, "step": 28018 }, { "epoch": 0.5604, "grad_norm": 1.921875, "grad_norm_var": 0.013496653238932291, "learning_rate": 0.0001, "loss": 4.2007, "loss/crossentropy": 2.278828501701355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22723327577114105, "step": 28020 }, { "epoch": 0.56044, "grad_norm": 1.7421875, "grad_norm_var": 0.016682942708333332, "learning_rate": 0.0001, "loss": 3.4084, "loss/crossentropy": 1.860534906387329, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17910753190517426, "step": 28022 }, { "epoch": 0.56048, "grad_norm": 1.734375, "grad_norm_var": 0.019945271809895835, "learning_rate": 0.0001, "loss": 3.6443, "loss/crossentropy": 2.094952940940857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19640950858592987, "step": 28024 }, { "epoch": 0.56052, "grad_norm": 2.0625, "grad_norm_var": 0.03429133097330729, "learning_rate": 0.0001, "loss": 3.9637, "loss/crossentropy": 2.2802677154541016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223518192768097, "step": 28026 }, { "epoch": 0.56056, "grad_norm": 2.078125, "grad_norm_var": 0.03408177693684896, "learning_rate": 0.0001, "loss": 4.2298, "loss/crossentropy": 2.382766842842102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22707599401474, "step": 28028 }, { "epoch": 0.5606, "grad_norm": 1.90625, "grad_norm_var": 0.033512115478515625, "learning_rate": 0.0001, "loss": 3.984, "loss/crossentropy": 1.861901044845581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17228157818317413, "step": 28030 }, { "epoch": 0.56064, "grad_norm": 1.96875, "grad_norm_var": 0.030454254150390624, "learning_rate": 0.0001, "loss": 3.8515, "loss/crossentropy": 1.8863429427146912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17535988986492157, "step": 28032 }, { "epoch": 0.56068, "grad_norm": 2.015625, "grad_norm_var": 0.02935765584309896, "learning_rate": 0.0001, "loss": 4.3787, "loss/crossentropy": 2.0987571477890015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073696255683899, "step": 28034 }, { "epoch": 0.56072, "grad_norm": 2.359375, "grad_norm_var": 0.03867975870768229, "learning_rate": 0.0001, "loss": 3.9042, "loss/crossentropy": 1.6989398002624512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1658734381198883, "step": 28036 }, { "epoch": 0.56076, "grad_norm": 2.0625, "grad_norm_var": 0.030586751302083333, "learning_rate": 0.0001, "loss": 3.8771, "loss/crossentropy": 2.007630228996277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17940524220466614, "step": 28038 }, { "epoch": 0.5608, "grad_norm": 1.78125, "grad_norm_var": 0.028888956705729166, "learning_rate": 0.0001, "loss": 3.8596, "loss/crossentropy": 2.2327693700790405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22857680916786194, "step": 28040 }, { "epoch": 0.56084, "grad_norm": 1.875, "grad_norm_var": 0.016524251302083334, "learning_rate": 0.0001, "loss": 3.9398, "loss/crossentropy": 2.051284074783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18832950294017792, "step": 28042 }, { "epoch": 0.56088, "grad_norm": 1.8984375, "grad_norm_var": 0.016627756754557292, "learning_rate": 0.0001, "loss": 3.9606, "loss/crossentropy": 1.9091659784317017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17789962142705917, "step": 28044 }, { "epoch": 0.56092, "grad_norm": 2.109375, "grad_norm_var": 0.05832112630208333, "learning_rate": 0.0001, "loss": 3.7684, "loss/crossentropy": 1.8703528046607971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24716350436210632, "step": 28046 }, { "epoch": 0.56096, "grad_norm": 1.8046875, "grad_norm_var": 0.060031890869140625, "learning_rate": 0.0001, "loss": 3.7816, "loss/crossentropy": 1.9286105036735535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17276924848556519, "step": 28048 }, { "epoch": 0.561, "grad_norm": 2.125, "grad_norm_var": 0.06181208292643229, "learning_rate": 0.0001, "loss": 4.2004, "loss/crossentropy": 2.115488290786743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20674347877502441, "step": 28050 }, { "epoch": 0.56104, "grad_norm": 1.8359375, "grad_norm_var": 0.058426920572916666, "learning_rate": 0.0001, "loss": 3.6918, "loss/crossentropy": 2.1754192113876343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19405043870210648, "step": 28052 }, { "epoch": 0.56108, "grad_norm": 2.28125, "grad_norm_var": 0.08551025390625, "learning_rate": 0.0001, "loss": 4.2559, "loss/crossentropy": 2.378780484199524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19838006794452667, "step": 28054 }, { "epoch": 0.56112, "grad_norm": 1.8984375, "grad_norm_var": 0.08651504516601563, "learning_rate": 0.0001, "loss": 4.1649, "loss/crossentropy": 1.8579109907150269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19068627059459686, "step": 28056 }, { "epoch": 0.56116, "grad_norm": 1.9453125, "grad_norm_var": 0.1222564697265625, "learning_rate": 0.0001, "loss": 4.17, "loss/crossentropy": 2.190861463546753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977207213640213, "step": 28058 }, { "epoch": 0.5612, "grad_norm": 1.8671875, "grad_norm_var": 0.15713297526041667, "learning_rate": 0.0001, "loss": 3.8599, "loss/crossentropy": 1.8596174716949463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18201489746570587, "step": 28060 }, { "epoch": 0.56124, "grad_norm": 1.9609375, "grad_norm_var": 0.1685808817545573, "learning_rate": 0.0001, "loss": 4.105, "loss/crossentropy": 1.8212996125221252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16964350640773773, "step": 28062 }, { "epoch": 0.56128, "grad_norm": 2.1875, "grad_norm_var": 0.15612691243489582, "learning_rate": 0.0001, "loss": 4.0088, "loss/crossentropy": 2.1002501845359802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20258799940347672, "step": 28064 }, { "epoch": 0.56132, "grad_norm": 2.21875, "grad_norm_var": 0.18886617024739583, "learning_rate": 0.0001, "loss": 4.0179, "loss/crossentropy": 2.1405937671661377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100813016295433, "step": 28066 }, { "epoch": 0.56136, "grad_norm": 2.90625, "grad_norm_var": 0.22138264973958333, "learning_rate": 0.0001, "loss": 3.7586, "loss/crossentropy": 2.23690402507782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19980955868959427, "step": 28068 }, { "epoch": 0.5614, "grad_norm": 3.015625, "grad_norm_var": 0.30380452473958336, "learning_rate": 0.0001, "loss": 3.5867, "loss/crossentropy": 1.875457525253296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16666759550571442, "step": 28070 }, { "epoch": 0.56144, "grad_norm": 2.78125, "grad_norm_var": 0.26413141886393227, "learning_rate": 0.0001, "loss": 4.0021, "loss/crossentropy": 2.1469756960868835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19378764182329178, "step": 28072 }, { "epoch": 0.56148, "grad_norm": 2.09375, "grad_norm_var": 0.25313212076822916, "learning_rate": 0.0001, "loss": 4.0959, "loss/crossentropy": 2.156083106994629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20279116183519363, "step": 28074 }, { "epoch": 0.56152, "grad_norm": 2.0, "grad_norm_var": 0.27230631510416664, "learning_rate": 0.0001, "loss": 3.9681, "loss/crossentropy": 1.9321695566177368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937977820634842, "step": 28076 }, { "epoch": 0.56156, "grad_norm": 2.078125, "grad_norm_var": 0.29947077433268227, "learning_rate": 0.0001, "loss": 3.6465, "loss/crossentropy": 2.005180776119232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18012206256389618, "step": 28078 }, { "epoch": 0.5616, "grad_norm": 1.7578125, "grad_norm_var": 0.35347391764322916, "learning_rate": 0.0001, "loss": 3.7987, "loss/crossentropy": 2.040120303630829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17059049755334854, "step": 28080 }, { "epoch": 0.56164, "grad_norm": 2.078125, "grad_norm_var": 0.34440511067708335, "learning_rate": 0.0001, "loss": 4.3903, "loss/crossentropy": 2.5171138048171997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21068025380373, "step": 28082 }, { "epoch": 0.56168, "grad_norm": 1.890625, "grad_norm_var": 0.2691070556640625, "learning_rate": 0.0001, "loss": 4.252, "loss/crossentropy": 2.1827582120895386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1995992213487625, "step": 28084 }, { "epoch": 0.56172, "grad_norm": 2.046875, "grad_norm_var": 0.09042154947916667, "learning_rate": 0.0001, "loss": 4.1365, "loss/crossentropy": 2.0082287192344666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23266522586345673, "step": 28086 }, { "epoch": 0.56176, "grad_norm": 1.8046875, "grad_norm_var": 0.024853261311848958, "learning_rate": 0.0001, "loss": 3.8143, "loss/crossentropy": 1.59010910987854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15921758115291595, "step": 28088 }, { "epoch": 0.5618, "grad_norm": 1.921875, "grad_norm_var": 0.012132771809895833, "learning_rate": 0.0001, "loss": 4.0431, "loss/crossentropy": 2.3045564889907837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091711461544037, "step": 28090 }, { "epoch": 0.56184, "grad_norm": 1.828125, "grad_norm_var": 0.0129547119140625, "learning_rate": 0.0001, "loss": 3.8729, "loss/crossentropy": 1.9903671741485596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19884900748729706, "step": 28092 }, { "epoch": 0.56188, "grad_norm": 1.7578125, "grad_norm_var": 0.011970011393229167, "learning_rate": 0.0001, "loss": 3.9903, "loss/crossentropy": 2.3190979957580566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20256339758634567, "step": 28094 }, { "epoch": 0.56192, "grad_norm": 1.890625, "grad_norm_var": 0.009983062744140625, "learning_rate": 0.0001, "loss": 4.0022, "loss/crossentropy": 1.8695592880249023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17132485657930374, "step": 28096 }, { "epoch": 0.56196, "grad_norm": 1.8203125, "grad_norm_var": 0.008253733317057291, "learning_rate": 0.0001, "loss": 3.7549, "loss/crossentropy": 1.9170012474060059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18364953994750977, "step": 28098 }, { "epoch": 0.562, "grad_norm": 1.6328125, "grad_norm_var": 0.011749013264973959, "learning_rate": 0.0001, "loss": 3.6666, "loss/crossentropy": 1.7711463570594788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16884273290634155, "step": 28100 }, { "epoch": 0.56204, "grad_norm": 2.0, "grad_norm_var": 0.0090728759765625, "learning_rate": 0.0001, "loss": 4.1265, "loss/crossentropy": 2.216683030128479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19033723324537277, "step": 28102 }, { "epoch": 0.56208, "grad_norm": 1.8984375, "grad_norm_var": 0.008666737874348959, "learning_rate": 0.0001, "loss": 3.7862, "loss/crossentropy": 2.068448841571808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18903378397226334, "step": 28104 }, { "epoch": 0.56212, "grad_norm": 1.8515625, "grad_norm_var": 0.008378092447916667, "learning_rate": 0.0001, "loss": 3.9428, "loss/crossentropy": 1.9194504618644714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17733828723430634, "step": 28106 }, { "epoch": 0.56216, "grad_norm": 1.84375, "grad_norm_var": 0.00836181640625, "learning_rate": 0.0001, "loss": 3.8966, "loss/crossentropy": 1.8640123009681702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18660252541303635, "step": 28108 }, { "epoch": 0.5622, "grad_norm": 1.8671875, "grad_norm_var": 0.006628163655598958, "learning_rate": 0.0001, "loss": 3.883, "loss/crossentropy": 2.084853768348694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1897662729024887, "step": 28110 }, { "epoch": 0.56224, "grad_norm": 1.7890625, "grad_norm_var": 0.006681060791015625, "learning_rate": 0.0001, "loss": 3.7912, "loss/crossentropy": 1.9047400951385498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16695564985275269, "step": 28112 }, { "epoch": 0.56228, "grad_norm": 1.9296875, "grad_norm_var": 0.007616170247395833, "learning_rate": 0.0001, "loss": 3.9128, "loss/crossentropy": 1.9586027264595032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18182067573070526, "step": 28114 }, { "epoch": 0.56232, "grad_norm": 1.8828125, "grad_norm_var": 0.007448069254557292, "learning_rate": 0.0001, "loss": 4.0318, "loss/crossentropy": 2.0719743371009827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18775883316993713, "step": 28116 }, { "epoch": 0.56236, "grad_norm": 2.03125, "grad_norm_var": 0.014070638020833333, "learning_rate": 0.0001, "loss": 4.5828, "loss/crossentropy": 2.2210047245025635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.32780784368515015, "step": 28118 }, { "epoch": 0.5624, "grad_norm": 1.9453125, "grad_norm_var": 0.0147125244140625, "learning_rate": 0.0001, "loss": 3.9965, "loss/crossentropy": 2.212351083755493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19030366092920303, "step": 28120 }, { "epoch": 0.56244, "grad_norm": 2.0, "grad_norm_var": 0.014939117431640624, "learning_rate": 0.0001, "loss": 4.1715, "loss/crossentropy": 2.2713009119033813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21552074700593948, "step": 28122 }, { "epoch": 0.56248, "grad_norm": 1.90625, "grad_norm_var": 0.014192708333333333, "learning_rate": 0.0001, "loss": 3.8242, "loss/crossentropy": 1.872189700603485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20009242743253708, "step": 28124 }, { "epoch": 0.56252, "grad_norm": 2.0625, "grad_norm_var": 0.015203603108723958, "learning_rate": 0.0001, "loss": 3.9798, "loss/crossentropy": 2.0979323983192444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19020158797502518, "step": 28126 }, { "epoch": 0.56256, "grad_norm": 2.0, "grad_norm_var": 0.010937245686848958, "learning_rate": 0.0001, "loss": 4.1726, "loss/crossentropy": 1.9983355402946472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18428711593151093, "step": 28128 }, { "epoch": 0.5626, "grad_norm": 2.109375, "grad_norm_var": 0.011864217122395833, "learning_rate": 0.0001, "loss": 4.0696, "loss/crossentropy": 1.784355342388153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23605111241340637, "step": 28130 }, { "epoch": 0.56264, "grad_norm": 1.9609375, "grad_norm_var": 0.01231689453125, "learning_rate": 0.0001, "loss": 3.932, "loss/crossentropy": 1.719801664352417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16245415434241295, "step": 28132 }, { "epoch": 0.56268, "grad_norm": 1.84375, "grad_norm_var": 0.00958251953125, "learning_rate": 0.0001, "loss": 3.8402, "loss/crossentropy": 1.8063556551933289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17704912275075912, "step": 28134 }, { "epoch": 0.56272, "grad_norm": 1.984375, "grad_norm_var": 0.007832845052083334, "learning_rate": 0.0001, "loss": 4.0469, "loss/crossentropy": 1.9270595908164978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1939307227730751, "step": 28136 }, { "epoch": 0.56276, "grad_norm": 1.8984375, "grad_norm_var": 0.008060709635416666, "learning_rate": 0.0001, "loss": 3.9754, "loss/crossentropy": 1.9221222400665283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1753731295466423, "step": 28138 }, { "epoch": 0.5628, "grad_norm": 1.765625, "grad_norm_var": 0.00994873046875, "learning_rate": 0.0001, "loss": 3.7678, "loss/crossentropy": 1.718525469303131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854087859392166, "step": 28140 }, { "epoch": 0.56284, "grad_norm": 2.015625, "grad_norm_var": 0.009200032552083333, "learning_rate": 0.0001, "loss": 4.0674, "loss/crossentropy": 1.9418652057647705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19125580787658691, "step": 28142 }, { "epoch": 0.56288, "grad_norm": 1.875, "grad_norm_var": 0.008587392171223958, "learning_rate": 0.0001, "loss": 4.1413, "loss/crossentropy": 2.100765645503998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19208013266324997, "step": 28144 }, { "epoch": 0.56292, "grad_norm": 1.8203125, "grad_norm_var": 0.010849761962890624, "learning_rate": 0.0001, "loss": 3.8, "loss/crossentropy": 1.9104883074760437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1662292778491974, "step": 28146 }, { "epoch": 0.56296, "grad_norm": 1.9296875, "grad_norm_var": 0.010092926025390626, "learning_rate": 0.0001, "loss": 3.8448, "loss/crossentropy": 2.059969484806061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18414444476366043, "step": 28148 }, { "epoch": 0.563, "grad_norm": 1.7890625, "grad_norm_var": 0.010392252604166667, "learning_rate": 0.0001, "loss": 3.8721, "loss/crossentropy": 1.9335008263587952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18294576555490494, "step": 28150 }, { "epoch": 0.56304, "grad_norm": 2.703125, "grad_norm_var": 0.05110575358072917, "learning_rate": 0.0001, "loss": 4.3341, "loss/crossentropy": 2.1855704188346863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19496884942054749, "step": 28152 }, { "epoch": 0.56308, "grad_norm": 2.140625, "grad_norm_var": 0.058176422119140626, "learning_rate": 0.0001, "loss": 4.2066, "loss/crossentropy": 2.1697583198547363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19855469465255737, "step": 28154 }, { "epoch": 0.56312, "grad_norm": 1.765625, "grad_norm_var": 0.058176422119140626, "learning_rate": 0.0001, "loss": 3.833, "loss/crossentropy": 1.926409661769867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18937255442142487, "step": 28156 }, { "epoch": 0.56316, "grad_norm": 1.9765625, "grad_norm_var": 0.059456125895182295, "learning_rate": 0.0001, "loss": 4.1422, "loss/crossentropy": 1.900004506111145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.190530426800251, "step": 28158 }, { "epoch": 0.5632, "grad_norm": 1.96875, "grad_norm_var": 0.06049982706705729, "learning_rate": 0.0001, "loss": 4.0405, "loss/crossentropy": 1.8822137117385864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1802232339978218, "step": 28160 }, { "epoch": 0.56324, "grad_norm": 1.8203125, "grad_norm_var": 0.0831207275390625, "learning_rate": 0.0001, "loss": 3.9198, "loss/crossentropy": 2.088913321495056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068777084350586, "step": 28162 }, { "epoch": 0.56328, "grad_norm": 1.9921875, "grad_norm_var": 0.08334935506184896, "learning_rate": 0.0001, "loss": 3.967, "loss/crossentropy": 2.0997453927993774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21025776118040085, "step": 28164 }, { "epoch": 0.56332, "grad_norm": 1.875, "grad_norm_var": 0.08046061197916667, "learning_rate": 0.0001, "loss": 3.9585, "loss/crossentropy": 2.19319224357605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984698325395584, "step": 28166 }, { "epoch": 0.56336, "grad_norm": 1.9453125, "grad_norm_var": 0.04764989217122396, "learning_rate": 0.0001, "loss": 4.0751, "loss/crossentropy": 2.022126793861389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894361823797226, "step": 28168 }, { "epoch": 0.5634, "grad_norm": 1.8984375, "grad_norm_var": 0.04308242797851562, "learning_rate": 0.0001, "loss": 3.9752, "loss/crossentropy": 2.0495948791503906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18251951038837433, "step": 28170 }, { "epoch": 0.56344, "grad_norm": 1.8515625, "grad_norm_var": 0.04134928385416667, "learning_rate": 0.0001, "loss": 4.1206, "loss/crossentropy": 1.9946674704551697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18497124314308167, "step": 28172 }, { "epoch": 0.56348, "grad_norm": 1.9375, "grad_norm_var": 0.03991063435872396, "learning_rate": 0.0001, "loss": 3.6823, "loss/crossentropy": 1.970799744129181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20478563010692596, "step": 28174 }, { "epoch": 0.56352, "grad_norm": 1.9453125, "grad_norm_var": 0.039606730143229164, "learning_rate": 0.0001, "loss": 3.6153, "loss/crossentropy": 1.9632895588874817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1864907443523407, "step": 28176 }, { "epoch": 0.56356, "grad_norm": 1.8984375, "grad_norm_var": 0.005150349934895834, "learning_rate": 0.0001, "loss": 3.8629, "loss/crossentropy": 1.9328609108924866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19433043897151947, "step": 28178 }, { "epoch": 0.5636, "grad_norm": 1.8828125, "grad_norm_var": 0.004727935791015625, "learning_rate": 0.0001, "loss": 3.9304, "loss/crossentropy": 2.1020684242248535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18058212101459503, "step": 28180 }, { "epoch": 0.56364, "grad_norm": 1.8671875, "grad_norm_var": 0.0033322652180989582, "learning_rate": 0.0001, "loss": 3.8489, "loss/crossentropy": 2.008388578891754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189447820186615, "step": 28182 }, { "epoch": 0.56368, "grad_norm": 1.78125, "grad_norm_var": 0.004073079427083333, "learning_rate": 0.0001, "loss": 3.554, "loss/crossentropy": 1.790539801120758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16334787011146545, "step": 28184 }, { "epoch": 0.56372, "grad_norm": 2.015625, "grad_norm_var": 0.004439290364583333, "learning_rate": 0.0001, "loss": 3.7392, "loss/crossentropy": 1.9603918194770813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18403412401676178, "step": 28186 }, { "epoch": 0.56376, "grad_norm": 1.8203125, "grad_norm_var": 0.0047760009765625, "learning_rate": 0.0001, "loss": 3.7194, "loss/crossentropy": 2.04629784822464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18315870314836502, "step": 28188 }, { "epoch": 0.5638, "grad_norm": 1.765625, "grad_norm_var": 0.007950592041015624, "learning_rate": 0.0001, "loss": 3.8053, "loss/crossentropy": 2.2571674585342407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19582515209913254, "step": 28190 }, { "epoch": 0.56384, "grad_norm": 2.046875, "grad_norm_var": 0.008668772379557292, "learning_rate": 0.0001, "loss": 4.0982, "loss/crossentropy": 1.923226773738861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19587361812591553, "step": 28192 }, { "epoch": 0.56388, "grad_norm": 1.9140625, "grad_norm_var": 0.007527669270833333, "learning_rate": 0.0001, "loss": 4.0072, "loss/crossentropy": 2.167626738548279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20034413039684296, "step": 28194 }, { "epoch": 0.56392, "grad_norm": 1.9296875, "grad_norm_var": 0.007462310791015625, "learning_rate": 0.0001, "loss": 3.9982, "loss/crossentropy": 2.1942092180252075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21775193512439728, "step": 28196 }, { "epoch": 0.56396, "grad_norm": 2.0, "grad_norm_var": 0.007950592041015624, "learning_rate": 0.0001, "loss": 4.1772, "loss/crossentropy": 1.789182722568512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19279392063617706, "step": 28198 }, { "epoch": 0.564, "grad_norm": 1.8984375, "grad_norm_var": 0.0064656575520833336, "learning_rate": 0.0001, "loss": 4.1771, "loss/crossentropy": 1.8911787271499634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18866229057312012, "step": 28200 }, { "epoch": 0.56404, "grad_norm": 1.984375, "grad_norm_var": 0.006091054280598958, "learning_rate": 0.0001, "loss": 3.9862, "loss/crossentropy": 2.0467012524604797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21637209504842758, "step": 28202 }, { "epoch": 0.56408, "grad_norm": 2.15625, "grad_norm_var": 0.007541656494140625, "learning_rate": 0.0001, "loss": 3.9004, "loss/crossentropy": 2.183765947818756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19495883584022522, "step": 28204 }, { "epoch": 0.56412, "grad_norm": 1.9609375, "grad_norm_var": 0.004489898681640625, "learning_rate": 0.0001, "loss": 4.0853, "loss/crossentropy": 2.237957239151001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20132584869861603, "step": 28206 }, { "epoch": 0.56416, "grad_norm": 1.859375, "grad_norm_var": 0.005549112955729167, "learning_rate": 0.0001, "loss": 3.7728, "loss/crossentropy": 1.9930571913719177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18800070136785507, "step": 28208 }, { "epoch": 0.5642, "grad_norm": 1.984375, "grad_norm_var": 0.005915323893229167, "learning_rate": 0.0001, "loss": 3.8866, "loss/crossentropy": 1.9499656558036804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19010086357593536, "step": 28210 }, { "epoch": 0.56424, "grad_norm": 1.9609375, "grad_norm_var": 0.005708567301432292, "learning_rate": 0.0001, "loss": 4.1938, "loss/crossentropy": 2.115884840488434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19502697885036469, "step": 28212 }, { "epoch": 0.56428, "grad_norm": 1.8984375, "grad_norm_var": 0.005940755208333333, "learning_rate": 0.0001, "loss": 4.1859, "loss/crossentropy": 2.1863423585891724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202358990907669, "step": 28214 }, { "epoch": 0.56432, "grad_norm": 1.78125, "grad_norm_var": 0.007661946614583333, "learning_rate": 0.0001, "loss": 3.6467, "loss/crossentropy": 1.860621988773346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18784625828266144, "step": 28216 }, { "epoch": 0.56436, "grad_norm": 1.8984375, "grad_norm_var": 0.007338205973307292, "learning_rate": 0.0001, "loss": 4.055, "loss/crossentropy": 1.7891228199005127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822507157921791, "step": 28218 }, { "epoch": 0.5644, "grad_norm": 1.7734375, "grad_norm_var": 0.0050771077473958336, "learning_rate": 0.0001, "loss": 3.9942, "loss/crossentropy": 2.050603687763214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20126069337129593, "step": 28220 }, { "epoch": 0.56444, "grad_norm": 1.8125, "grad_norm_var": 0.005350748697916667, "learning_rate": 0.0001, "loss": 3.9056, "loss/crossentropy": 2.3879984617233276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19843067973852158, "step": 28222 }, { "epoch": 0.56448, "grad_norm": 1.953125, "grad_norm_var": 0.005158487955729167, "learning_rate": 0.0001, "loss": 4.0579, "loss/crossentropy": 2.0878185033798218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18253600597381592, "step": 28224 }, { "epoch": 0.56452, "grad_norm": 1.96875, "grad_norm_var": 0.0050432840983072914, "learning_rate": 0.0001, "loss": 3.9793, "loss/crossentropy": 2.1168408393859863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21165993809700012, "step": 28226 }, { "epoch": 0.56456, "grad_norm": 1.8203125, "grad_norm_var": 0.0061948140462239586, "learning_rate": 0.0001, "loss": 3.8824, "loss/crossentropy": 1.8579602241516113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18875548988580704, "step": 28228 }, { "epoch": 0.5646, "grad_norm": 1.7109375, "grad_norm_var": 0.007323201497395833, "learning_rate": 0.0001, "loss": 3.7804, "loss/crossentropy": 2.0095511078834534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1720530390739441, "step": 28230 }, { "epoch": 0.56464, "grad_norm": 1.96875, "grad_norm_var": 0.0074765523274739586, "learning_rate": 0.0001, "loss": 3.841, "loss/crossentropy": 1.8599479794502258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17767678201198578, "step": 28232 }, { "epoch": 0.56468, "grad_norm": 1.9296875, "grad_norm_var": 0.007746378580729167, "learning_rate": 0.0001, "loss": 4.2437, "loss/crossentropy": 2.11702424287796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1897270455956459, "step": 28234 }, { "epoch": 0.56472, "grad_norm": 1.9140625, "grad_norm_var": 0.006180826822916667, "learning_rate": 0.0001, "loss": 3.9554, "loss/crossentropy": 1.8956347703933716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1801730990409851, "step": 28236 }, { "epoch": 0.56476, "grad_norm": 1.953125, "grad_norm_var": 0.006766764322916666, "learning_rate": 0.0001, "loss": 3.9988, "loss/crossentropy": 2.044249713420868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17439774423837662, "step": 28238 }, { "epoch": 0.5648, "grad_norm": 1.8671875, "grad_norm_var": 0.006534576416015625, "learning_rate": 0.0001, "loss": 3.9429, "loss/crossentropy": 1.8406980633735657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18690447509288788, "step": 28240 }, { "epoch": 0.56484, "grad_norm": 1.84375, "grad_norm_var": 0.03523661295572917, "learning_rate": 0.0001, "loss": 3.6977, "loss/crossentropy": 1.8951767683029175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18628580123186111, "step": 28242 }, { "epoch": 0.56488, "grad_norm": 1.8046875, "grad_norm_var": 0.03559951782226563, "learning_rate": 0.0001, "loss": 3.638, "loss/crossentropy": 1.9350382685661316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19427261501550674, "step": 28244 }, { "epoch": 0.56492, "grad_norm": 1.9296875, "grad_norm_var": 0.032698567708333334, "learning_rate": 0.0001, "loss": 3.997, "loss/crossentropy": 2.370202422142029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2236226201057434, "step": 28246 }, { "epoch": 0.56496, "grad_norm": 1.84375, "grad_norm_var": 0.032163238525390624, "learning_rate": 0.0001, "loss": 4.0368, "loss/crossentropy": 2.1876412630081177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005016803741455, "step": 28248 }, { "epoch": 0.565, "grad_norm": 2.046875, "grad_norm_var": 0.0342193603515625, "learning_rate": 0.0001, "loss": 3.9678, "loss/crossentropy": 2.1329512000083923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20921162515878677, "step": 28250 }, { "epoch": 0.56504, "grad_norm": 2.203125, "grad_norm_var": 0.040036773681640624, "learning_rate": 0.0001, "loss": 3.8688, "loss/crossentropy": 1.8053449392318726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17606808245182037, "step": 28252 }, { "epoch": 0.56508, "grad_norm": 1.765625, "grad_norm_var": 0.04079360961914062, "learning_rate": 0.0001, "loss": 3.9478, "loss/crossentropy": 2.0177281498908997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19240254908800125, "step": 28254 }, { "epoch": 0.56512, "grad_norm": 1.953125, "grad_norm_var": 0.04112930297851562, "learning_rate": 0.0001, "loss": 3.9164, "loss/crossentropy": 1.9807183146476746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726047471165657, "step": 28256 }, { "epoch": 0.56516, "grad_norm": 1.9140625, "grad_norm_var": 0.012345377604166667, "learning_rate": 0.0001, "loss": 4.0006, "loss/crossentropy": 2.0992757081985474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19170843064785004, "step": 28258 }, { "epoch": 0.5652, "grad_norm": 1.828125, "grad_norm_var": 0.01220703125, "learning_rate": 0.0001, "loss": 3.6902, "loss/crossentropy": 1.865010380744934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18145494908094406, "step": 28260 }, { "epoch": 0.56524, "grad_norm": 1.9609375, "grad_norm_var": 0.012788899739583333, "learning_rate": 0.0001, "loss": 3.7781, "loss/crossentropy": 2.060899078845978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17389538139104843, "step": 28262 }, { "epoch": 0.56528, "grad_norm": 1.8046875, "grad_norm_var": 0.013911946614583334, "learning_rate": 0.0001, "loss": 3.8494, "loss/crossentropy": 1.7959403991699219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17851080000400543, "step": 28264 }, { "epoch": 0.56532, "grad_norm": 1.9140625, "grad_norm_var": 0.012410227457682292, "learning_rate": 0.0001, "loss": 3.8456, "loss/crossentropy": 2.169707775115967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19611390680074692, "step": 28266 }, { "epoch": 0.56536, "grad_norm": 2.1875, "grad_norm_var": 0.015697224934895834, "learning_rate": 0.0001, "loss": 4.1967, "loss/crossentropy": 2.322705864906311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728319883346558, "step": 28268 }, { "epoch": 0.5654, "grad_norm": 2.09375, "grad_norm_var": 0.013423665364583334, "learning_rate": 0.0001, "loss": 3.8118, "loss/crossentropy": 1.9398122429847717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19211900979280472, "step": 28270 }, { "epoch": 0.56544, "grad_norm": 1.890625, "grad_norm_var": 0.013435618082682291, "learning_rate": 0.0001, "loss": 3.9892, "loss/crossentropy": 1.7359251976013184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16433238983154297, "step": 28272 }, { "epoch": 0.56548, "grad_norm": 1.8125, "grad_norm_var": 0.014973958333333334, "learning_rate": 0.0001, "loss": 4.1366, "loss/crossentropy": 2.3107075691223145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127886563539505, "step": 28274 }, { "epoch": 0.56552, "grad_norm": 1.734375, "grad_norm_var": 0.01906305948893229, "learning_rate": 0.0001, "loss": 3.5307, "loss/crossentropy": 1.5368210673332214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15861748158931732, "step": 28276 }, { "epoch": 0.56556, "grad_norm": 1.90625, "grad_norm_var": 0.01902033487955729, "learning_rate": 0.0001, "loss": 4.2099, "loss/crossentropy": 2.145693838596344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1863054633140564, "step": 28278 }, { "epoch": 0.5656, "grad_norm": 1.828125, "grad_norm_var": 0.018464152018229166, "learning_rate": 0.0001, "loss": 3.9928, "loss/crossentropy": 2.3174896240234375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882578581571579, "step": 28280 }, { "epoch": 0.56564, "grad_norm": 1.9296875, "grad_norm_var": 0.01803766886393229, "learning_rate": 0.0001, "loss": 4.2401, "loss/crossentropy": 2.3332719802856445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2089502438902855, "step": 28282 }, { "epoch": 0.56568, "grad_norm": 1.84375, "grad_norm_var": 0.008066558837890625, "learning_rate": 0.0001, "loss": 3.9758, "loss/crossentropy": 2.3516300916671753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21649855375289917, "step": 28284 }, { "epoch": 0.56572, "grad_norm": 1.8359375, "grad_norm_var": 0.0067138671875, "learning_rate": 0.0001, "loss": 3.9705, "loss/crossentropy": 1.9031076431274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18855175375938416, "step": 28286 }, { "epoch": 0.56576, "grad_norm": 1.96875, "grad_norm_var": 0.007100423177083333, "learning_rate": 0.0001, "loss": 4.1005, "loss/crossentropy": 1.5520136952400208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1816611886024475, "step": 28288 }, { "epoch": 0.5658, "grad_norm": 1.9921875, "grad_norm_var": 0.006894683837890625, "learning_rate": 0.0001, "loss": 3.7001, "loss/crossentropy": 1.9759097695350647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19612447917461395, "step": 28290 }, { "epoch": 0.56584, "grad_norm": 1.8515625, "grad_norm_var": 0.005163319905598958, "learning_rate": 0.0001, "loss": 4.0389, "loss/crossentropy": 2.1442651748657227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18353651463985443, "step": 28292 }, { "epoch": 0.56588, "grad_norm": 2.03125, "grad_norm_var": 0.005914052327473958, "learning_rate": 0.0001, "loss": 4.0021, "loss/crossentropy": 1.8098361492156982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18010258674621582, "step": 28294 }, { "epoch": 0.56592, "grad_norm": 1.9140625, "grad_norm_var": 0.0052886962890625, "learning_rate": 0.0001, "loss": 3.9866, "loss/crossentropy": 2.2412387132644653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21006706357002258, "step": 28296 }, { "epoch": 0.56596, "grad_norm": 1.921875, "grad_norm_var": 0.006461334228515625, "learning_rate": 0.0001, "loss": 4.07, "loss/crossentropy": 1.927619457244873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915675550699234, "step": 28298 }, { "epoch": 0.566, "grad_norm": 1.7734375, "grad_norm_var": 0.007502237955729167, "learning_rate": 0.0001, "loss": 3.9184, "loss/crossentropy": 2.317517042160034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19702450931072235, "step": 28300 }, { "epoch": 0.56604, "grad_norm": 1.984375, "grad_norm_var": 0.006514231363932292, "learning_rate": 0.0001, "loss": 4.0558, "loss/crossentropy": 2.02554988861084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20614077150821686, "step": 28302 }, { "epoch": 0.56608, "grad_norm": 1.875, "grad_norm_var": 0.006359608968098959, "learning_rate": 0.0001, "loss": 3.9733, "loss/crossentropy": 2.2094991207122803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18375249207019806, "step": 28304 }, { "epoch": 0.56612, "grad_norm": 1.8828125, "grad_norm_var": 0.0048601786295572914, "learning_rate": 0.0001, "loss": 3.956, "loss/crossentropy": 1.938852310180664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19518058001995087, "step": 28306 }, { "epoch": 0.56616, "grad_norm": 2.09375, "grad_norm_var": 0.0065081278483072914, "learning_rate": 0.0001, "loss": 4.0979, "loss/crossentropy": 2.0719003677368164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19132573902606964, "step": 28308 }, { "epoch": 0.5662, "grad_norm": 1.9140625, "grad_norm_var": 0.0059588114420572914, "learning_rate": 0.0001, "loss": 3.838, "loss/crossentropy": 2.01191109418869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19400015473365784, "step": 28310 }, { "epoch": 0.56624, "grad_norm": 1.875, "grad_norm_var": 0.005782063802083333, "learning_rate": 0.0001, "loss": 3.9532, "loss/crossentropy": 2.302769422531128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21603811532258987, "step": 28312 }, { "epoch": 0.56628, "grad_norm": 2.03125, "grad_norm_var": 0.006254069010416667, "learning_rate": 0.0001, "loss": 3.9965, "loss/crossentropy": 1.8251231908798218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1660621464252472, "step": 28314 }, { "epoch": 0.56632, "grad_norm": 1.8046875, "grad_norm_var": 0.0079254150390625, "learning_rate": 0.0001, "loss": 4.2141, "loss/crossentropy": 2.0656734704971313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19783338904380798, "step": 28316 }, { "epoch": 0.56636, "grad_norm": 1.9453125, "grad_norm_var": 0.008485666910807292, "learning_rate": 0.0001, "loss": 4.1956, "loss/crossentropy": 1.949280321598053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19571927189826965, "step": 28318 }, { "epoch": 0.5664, "grad_norm": 1.96875, "grad_norm_var": 0.008571116129557292, "learning_rate": 0.0001, "loss": 4.0139, "loss/crossentropy": 1.801173746585846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17420585453510284, "step": 28320 }, { "epoch": 0.56644, "grad_norm": 1.90625, "grad_norm_var": 0.008519490559895834, "learning_rate": 0.0001, "loss": 3.8094, "loss/crossentropy": 2.000474214553833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19203417748212814, "step": 28322 }, { "epoch": 0.56648, "grad_norm": 1.84375, "grad_norm_var": 0.006681315104166667, "learning_rate": 0.0001, "loss": 4.0042, "loss/crossentropy": 1.9475297331809998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19463783502578735, "step": 28324 }, { "epoch": 0.56652, "grad_norm": 1.9921875, "grad_norm_var": 0.005965169270833333, "learning_rate": 0.0001, "loss": 3.9425, "loss/crossentropy": 2.029974043369293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1769544631242752, "step": 28326 }, { "epoch": 0.56656, "grad_norm": 1.8828125, "grad_norm_var": 0.0058258056640625, "learning_rate": 0.0001, "loss": 3.995, "loss/crossentropy": 1.9521240592002869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189279705286026, "step": 28328 }, { "epoch": 0.5666, "grad_norm": 1.8984375, "grad_norm_var": 0.006136067708333333, "learning_rate": 0.0001, "loss": 4.1978, "loss/crossentropy": 2.128006398677826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22940929979085922, "step": 28330 }, { "epoch": 0.56664, "grad_norm": 1.78125, "grad_norm_var": 0.005234527587890625, "learning_rate": 0.0001, "loss": 3.9098, "loss/crossentropy": 2.3887888193130493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106098011136055, "step": 28332 }, { "epoch": 0.56668, "grad_norm": 1.78125, "grad_norm_var": 0.006639607747395833, "learning_rate": 0.0001, "loss": 3.9625, "loss/crossentropy": 1.9730157256126404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026069611310959, "step": 28334 }, { "epoch": 0.56672, "grad_norm": 2.015625, "grad_norm_var": 0.006856028238932292, "learning_rate": 0.0001, "loss": 4.0934, "loss/crossentropy": 2.384614109992981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21988043189048767, "step": 28336 }, { "epoch": 0.56676, "grad_norm": 2.046875, "grad_norm_var": 0.008388010660807292, "learning_rate": 0.0001, "loss": 3.9947, "loss/crossentropy": 1.843708097934723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18971703946590424, "step": 28338 }, { "epoch": 0.5668, "grad_norm": 2.0625, "grad_norm_var": 0.009897613525390625, "learning_rate": 0.0001, "loss": 3.953, "loss/crossentropy": 2.1084065437316895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19351261109113693, "step": 28340 }, { "epoch": 0.56684, "grad_norm": 1.8125, "grad_norm_var": 0.011321767171223959, "learning_rate": 0.0001, "loss": 3.742, "loss/crossentropy": 1.7598699927330017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18556873500347137, "step": 28342 }, { "epoch": 0.56688, "grad_norm": 1.9140625, "grad_norm_var": 0.0115875244140625, "learning_rate": 0.0001, "loss": 3.8072, "loss/crossentropy": 1.952016532421112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828347072005272, "step": 28344 }, { "epoch": 0.56692, "grad_norm": 1.9453125, "grad_norm_var": 0.010286458333333333, "learning_rate": 0.0001, "loss": 3.939, "loss/crossentropy": 1.9486799240112305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19223074615001678, "step": 28346 }, { "epoch": 0.56696, "grad_norm": 1.9140625, "grad_norm_var": 0.009197743733723958, "learning_rate": 0.0001, "loss": 3.9421, "loss/crossentropy": 1.9944785237312317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19656967371702194, "step": 28348 }, { "epoch": 0.567, "grad_norm": 1.9609375, "grad_norm_var": 0.007911936442057291, "learning_rate": 0.0001, "loss": 3.5903, "loss/crossentropy": 2.055353820323944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19104135036468506, "step": 28350 }, { "epoch": 0.56704, "grad_norm": 1.8984375, "grad_norm_var": 0.007096354166666667, "learning_rate": 0.0001, "loss": 3.8961, "loss/crossentropy": 1.9901525974273682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20019636303186417, "step": 28352 }, { "epoch": 0.56708, "grad_norm": 1.9140625, "grad_norm_var": 0.005418904622395833, "learning_rate": 0.0001, "loss": 4.0514, "loss/crossentropy": 2.246248483657837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20701756328344345, "step": 28354 }, { "epoch": 0.56712, "grad_norm": 2.234375, "grad_norm_var": 0.011136881510416667, "learning_rate": 0.0001, "loss": 3.8456, "loss/crossentropy": 1.9263428449630737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19890911877155304, "step": 28356 }, { "epoch": 0.56716, "grad_norm": 1.9609375, "grad_norm_var": 0.009338124593098959, "learning_rate": 0.0001, "loss": 4.0555, "loss/crossentropy": 2.2119653820991516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20472905784845352, "step": 28358 }, { "epoch": 0.5672, "grad_norm": 2.09375, "grad_norm_var": 0.010406239827473959, "learning_rate": 0.0001, "loss": 4.2023, "loss/crossentropy": 2.1126968264579773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18353600800037384, "step": 28360 }, { "epoch": 0.56724, "grad_norm": 2.03125, "grad_norm_var": 0.010929361979166666, "learning_rate": 0.0001, "loss": 3.8412, "loss/crossentropy": 1.7391886115074158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1703544184565544, "step": 28362 }, { "epoch": 0.56728, "grad_norm": 1.9140625, "grad_norm_var": 0.010689036051432291, "learning_rate": 0.0001, "loss": 3.8372, "loss/crossentropy": 1.8684263229370117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18913953006267548, "step": 28364 }, { "epoch": 0.56732, "grad_norm": 1.828125, "grad_norm_var": 0.010699208577473958, "learning_rate": 0.0001, "loss": 3.7079, "loss/crossentropy": 1.971863567829132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188376322388649, "step": 28366 }, { "epoch": 0.56736, "grad_norm": 2.015625, "grad_norm_var": 0.010827382405598959, "learning_rate": 0.0001, "loss": 4.1637, "loss/crossentropy": 1.8890693187713623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18029199540615082, "step": 28368 }, { "epoch": 0.5674, "grad_norm": 2.109375, "grad_norm_var": 0.011861165364583334, "learning_rate": 0.0001, "loss": 4.2677, "loss/crossentropy": 1.9506273865699768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19081934541463852, "step": 28370 }, { "epoch": 0.56744, "grad_norm": 2.03125, "grad_norm_var": 0.006463368733723958, "learning_rate": 0.0001, "loss": 4.0055, "loss/crossentropy": 1.937731921672821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21456565707921982, "step": 28372 }, { "epoch": 0.56748, "grad_norm": 1.859375, "grad_norm_var": 0.006891886393229167, "learning_rate": 0.0001, "loss": 4.0904, "loss/crossentropy": 2.0715887546539307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21278052031993866, "step": 28374 }, { "epoch": 0.56752, "grad_norm": 1.7890625, "grad_norm_var": 0.007413736979166667, "learning_rate": 0.0001, "loss": 3.9901, "loss/crossentropy": 2.2458627223968506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20296980440616608, "step": 28376 }, { "epoch": 0.56756, "grad_norm": 1.8984375, "grad_norm_var": 0.006886545817057292, "learning_rate": 0.0001, "loss": 4.178, "loss/crossentropy": 2.3213919401168823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19989027082920074, "step": 28378 }, { "epoch": 0.5676, "grad_norm": 1.921875, "grad_norm_var": 0.007500966389973958, "learning_rate": 0.0001, "loss": 4.2319, "loss/crossentropy": 2.543839931488037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22476375102996826, "step": 28380 }, { "epoch": 0.56764, "grad_norm": 1.984375, "grad_norm_var": 0.006084950764973959, "learning_rate": 0.0001, "loss": 3.9426, "loss/crossentropy": 2.1060808897018433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993144229054451, "step": 28382 }, { "epoch": 0.56768, "grad_norm": 1.9375, "grad_norm_var": 0.0074859619140625, "learning_rate": 0.0001, "loss": 3.8902, "loss/crossentropy": 1.9778028726577759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18509671837091446, "step": 28384 }, { "epoch": 0.56772, "grad_norm": 2.203125, "grad_norm_var": 0.010445149739583333, "learning_rate": 0.0001, "loss": 3.7404, "loss/crossentropy": 1.7300578951835632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17200996726751328, "step": 28386 }, { "epoch": 0.56776, "grad_norm": 1.7734375, "grad_norm_var": 0.011352284749348959, "learning_rate": 0.0001, "loss": 3.8029, "loss/crossentropy": 1.87715744972229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1691356599330902, "step": 28388 }, { "epoch": 0.5678, "grad_norm": 1.7578125, "grad_norm_var": 0.01263427734375, "learning_rate": 0.0001, "loss": 3.7688, "loss/crossentropy": 1.9529326558113098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18868190050125122, "step": 28390 }, { "epoch": 0.56784, "grad_norm": 1.84375, "grad_norm_var": 0.012654622395833334, "learning_rate": 0.0001, "loss": 3.9265, "loss/crossentropy": 1.9922877550125122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18494701385498047, "step": 28392 }, { "epoch": 0.56788, "grad_norm": 2.046875, "grad_norm_var": 0.014012654622395834, "learning_rate": 0.0001, "loss": 4.0745, "loss/crossentropy": 2.1247864961624146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20046664774417877, "step": 28394 }, { "epoch": 0.56792, "grad_norm": 1.796875, "grad_norm_var": 0.019334920247395835, "learning_rate": 0.0001, "loss": 3.9056, "loss/crossentropy": 2.106461524963379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1846928596496582, "step": 28396 }, { "epoch": 0.56796, "grad_norm": 1.8203125, "grad_norm_var": 0.02042210896809896, "learning_rate": 0.0001, "loss": 4.0709, "loss/crossentropy": 2.5050116777420044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23650984466075897, "step": 28398 }, { "epoch": 0.568, "grad_norm": 1.9765625, "grad_norm_var": 0.01956151326497396, "learning_rate": 0.0001, "loss": 3.9026, "loss/crossentropy": 1.8201875686645508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18172843009233475, "step": 28400 }, { "epoch": 0.56804, "grad_norm": 1.78125, "grad_norm_var": 0.014680735270182292, "learning_rate": 0.0001, "loss": 3.7716, "loss/crossentropy": 1.748970091342926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1714978739619255, "step": 28402 }, { "epoch": 0.56808, "grad_norm": 1.8984375, "grad_norm_var": 0.013703409830729167, "learning_rate": 0.0001, "loss": 3.9609, "loss/crossentropy": 2.027329981327057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16597037762403488, "step": 28404 }, { "epoch": 0.56812, "grad_norm": 1.859375, "grad_norm_var": 0.012400054931640625, "learning_rate": 0.0001, "loss": 3.8902, "loss/crossentropy": 1.8651673197746277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17772310227155685, "step": 28406 }, { "epoch": 0.56816, "grad_norm": 1.8671875, "grad_norm_var": 0.011869049072265625, "learning_rate": 0.0001, "loss": 4.0313, "loss/crossentropy": 2.1574735045433044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2000775933265686, "step": 28408 }, { "epoch": 0.5682, "grad_norm": 1.8515625, "grad_norm_var": 0.010861968994140625, "learning_rate": 0.0001, "loss": 4.1171, "loss/crossentropy": 2.1159602403640747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19108104705810547, "step": 28410 }, { "epoch": 0.56824, "grad_norm": 2.171875, "grad_norm_var": 0.008829498291015625, "learning_rate": 0.0001, "loss": 4.229, "loss/crossentropy": 2.203667163848877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2056882083415985, "step": 28412 }, { "epoch": 0.56828, "grad_norm": 1.9296875, "grad_norm_var": 0.008381907145182292, "learning_rate": 0.0001, "loss": 4.0476, "loss/crossentropy": 2.0054327845573425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914311721920967, "step": 28414 }, { "epoch": 0.56832, "grad_norm": 1.8984375, "grad_norm_var": 0.008489735921223958, "learning_rate": 0.0001, "loss": 4.1249, "loss/crossentropy": 2.199601411819458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020196169614792, "step": 28416 }, { "epoch": 0.56836, "grad_norm": 2.015625, "grad_norm_var": 0.007682291666666666, "learning_rate": 0.0001, "loss": 4.1068, "loss/crossentropy": 1.84443199634552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17476340383291245, "step": 28418 }, { "epoch": 0.5684, "grad_norm": 1.9140625, "grad_norm_var": 0.007624308268229167, "learning_rate": 0.0001, "loss": 3.9032, "loss/crossentropy": 2.2852976322174072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21395886689424515, "step": 28420 }, { "epoch": 0.56844, "grad_norm": 2.046875, "grad_norm_var": 0.009956614176432291, "learning_rate": 0.0001, "loss": 4.0763, "loss/crossentropy": 2.2443678975105286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19366547465324402, "step": 28422 }, { "epoch": 0.56848, "grad_norm": 1.953125, "grad_norm_var": 0.012784830729166667, "learning_rate": 0.0001, "loss": 4.2711, "loss/crossentropy": 2.055034577846527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914302259683609, "step": 28424 }, { "epoch": 0.56852, "grad_norm": 1.875, "grad_norm_var": 0.012245432535807291, "learning_rate": 0.0001, "loss": 4.0782, "loss/crossentropy": 2.33309006690979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132418155670166, "step": 28426 }, { "epoch": 0.56856, "grad_norm": 1.9140625, "grad_norm_var": 0.009246571858723959, "learning_rate": 0.0001, "loss": 4.2069, "loss/crossentropy": 2.4401670694351196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20467859506607056, "step": 28428 }, { "epoch": 0.5686, "grad_norm": 1.9375, "grad_norm_var": 0.015778605143229166, "learning_rate": 0.0001, "loss": 3.9526, "loss/crossentropy": 1.8660191297531128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23643852770328522, "step": 28430 }, { "epoch": 0.56864, "grad_norm": 2.109375, "grad_norm_var": 0.016080729166666665, "learning_rate": 0.0001, "loss": 4.1835, "loss/crossentropy": 2.026413381099701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25309374928474426, "step": 28432 }, { "epoch": 0.56868, "grad_norm": 1.8203125, "grad_norm_var": 0.01834691365559896, "learning_rate": 0.0001, "loss": 4.0417, "loss/crossentropy": 2.1234898567199707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19204584509134293, "step": 28434 }, { "epoch": 0.56872, "grad_norm": 2.046875, "grad_norm_var": 0.017292277018229166, "learning_rate": 0.0001, "loss": 4.1999, "loss/crossentropy": 2.3347290754318237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21934974193572998, "step": 28436 }, { "epoch": 0.56876, "grad_norm": 1.828125, "grad_norm_var": 0.016076405843098957, "learning_rate": 0.0001, "loss": 3.8796, "loss/crossentropy": 2.0287744402885437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1834561824798584, "step": 28438 }, { "epoch": 0.5688, "grad_norm": 1.8671875, "grad_norm_var": 0.014654286702473958, "learning_rate": 0.0001, "loss": 4.0872, "loss/crossentropy": 1.782648503780365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1810266673564911, "step": 28440 }, { "epoch": 0.56884, "grad_norm": 1.8125, "grad_norm_var": 0.016139475504557292, "learning_rate": 0.0001, "loss": 4.0876, "loss/crossentropy": 2.2251389622688293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2041720598936081, "step": 28442 }, { "epoch": 0.56888, "grad_norm": 1.7734375, "grad_norm_var": 0.018846638997395835, "learning_rate": 0.0001, "loss": 3.7876, "loss/crossentropy": 1.9368031024932861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828494518995285, "step": 28444 }, { "epoch": 0.56892, "grad_norm": 1.890625, "grad_norm_var": 0.011763254801432291, "learning_rate": 0.0001, "loss": 3.9775, "loss/crossentropy": 2.0573925971984863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192618265748024, "step": 28446 }, { "epoch": 0.56896, "grad_norm": 1.90625, "grad_norm_var": 0.0088775634765625, "learning_rate": 0.0001, "loss": 3.8321, "loss/crossentropy": 1.9373971223831177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20026732236146927, "step": 28448 }, { "epoch": 0.569, "grad_norm": 1.8203125, "grad_norm_var": 0.007391103108723958, "learning_rate": 0.0001, "loss": 3.8771, "loss/crossentropy": 1.877286970615387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17163537442684174, "step": 28450 }, { "epoch": 0.56904, "grad_norm": 1.9140625, "grad_norm_var": 0.0054931640625, "learning_rate": 0.0001, "loss": 3.8502, "loss/crossentropy": 2.1928617358207703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20396625250577927, "step": 28452 }, { "epoch": 0.56908, "grad_norm": 1.9609375, "grad_norm_var": 0.008373006184895834, "learning_rate": 0.0001, "loss": 3.9737, "loss/crossentropy": 2.087548613548279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2330121174454689, "step": 28454 }, { "epoch": 0.56912, "grad_norm": 1.921875, "grad_norm_var": 0.008275349934895834, "learning_rate": 0.0001, "loss": 4.24, "loss/crossentropy": 2.0506752729415894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19728151708841324, "step": 28456 }, { "epoch": 0.56916, "grad_norm": 1.9453125, "grad_norm_var": 0.006174468994140625, "learning_rate": 0.0001, "loss": 4.1038, "loss/crossentropy": 2.3582634925842285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21210015565156937, "step": 28458 }, { "epoch": 0.5692, "grad_norm": 1.9453125, "grad_norm_var": 0.0051422119140625, "learning_rate": 0.0001, "loss": 3.8762, "loss/crossentropy": 2.3920403718948364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20361161977052689, "step": 28460 }, { "epoch": 0.56924, "grad_norm": 1.953125, "grad_norm_var": 0.0048248291015625, "learning_rate": 0.0001, "loss": 3.9837, "loss/crossentropy": 2.077300548553467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18526732921600342, "step": 28462 }, { "epoch": 0.56928, "grad_norm": 1.828125, "grad_norm_var": 0.0059967041015625, "learning_rate": 0.0001, "loss": 3.7625, "loss/crossentropy": 2.0746008157730103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.176588736474514, "step": 28464 }, { "epoch": 0.56932, "grad_norm": 1.8671875, "grad_norm_var": 0.005008697509765625, "learning_rate": 0.0001, "loss": 4.0275, "loss/crossentropy": 2.651300311088562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22577688097953796, "step": 28466 }, { "epoch": 0.56936, "grad_norm": 1.8203125, "grad_norm_var": 0.006477864583333334, "learning_rate": 0.0001, "loss": 3.9464, "loss/crossentropy": 2.169211745262146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20319853723049164, "step": 28468 }, { "epoch": 0.5694, "grad_norm": 1.7109375, "grad_norm_var": 0.005673980712890625, "learning_rate": 0.0001, "loss": 3.7715, "loss/crossentropy": 2.16380512714386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928708478808403, "step": 28470 }, { "epoch": 0.56944, "grad_norm": 1.890625, "grad_norm_var": 0.0054094950358072914, "learning_rate": 0.0001, "loss": 3.8378, "loss/crossentropy": 1.780498206615448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1736089140176773, "step": 28472 }, { "epoch": 0.56948, "grad_norm": 1.9765625, "grad_norm_var": 0.005597941080729167, "learning_rate": 0.0001, "loss": 4.1348, "loss/crossentropy": 1.9082182049751282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17953625321388245, "step": 28474 }, { "epoch": 0.56952, "grad_norm": 1.984375, "grad_norm_var": 0.005499013264973958, "learning_rate": 0.0001, "loss": 3.8997, "loss/crossentropy": 1.6533525586128235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16805779933929443, "step": 28476 }, { "epoch": 0.56956, "grad_norm": 2.03125, "grad_norm_var": 0.006882476806640625, "learning_rate": 0.0001, "loss": 4.0518, "loss/crossentropy": 2.0916685461997986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956561803817749, "step": 28478 }, { "epoch": 0.5696, "grad_norm": 2.03125, "grad_norm_var": 0.0078277587890625, "learning_rate": 0.0001, "loss": 4.1282, "loss/crossentropy": 1.9846032857894897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17997410148382187, "step": 28480 }, { "epoch": 0.56964, "grad_norm": 1.9765625, "grad_norm_var": 0.008396148681640625, "learning_rate": 0.0001, "loss": 4.0715, "loss/crossentropy": 1.9905300736427307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17957545816898346, "step": 28482 }, { "epoch": 0.56968, "grad_norm": 1.9453125, "grad_norm_var": 0.007356516520182292, "learning_rate": 0.0001, "loss": 3.7545, "loss/crossentropy": 1.9274362921714783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20025887340307236, "step": 28484 }, { "epoch": 0.56972, "grad_norm": 1.953125, "grad_norm_var": 0.003981272379557292, "learning_rate": 0.0001, "loss": 3.8606, "loss/crossentropy": 1.7590890526771545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17347681522369385, "step": 28486 }, { "epoch": 0.56976, "grad_norm": 1.9453125, "grad_norm_var": 0.003979237874348959, "learning_rate": 0.0001, "loss": 3.9737, "loss/crossentropy": 1.8959112763404846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18316560983657837, "step": 28488 }, { "epoch": 0.5698, "grad_norm": 2.046875, "grad_norm_var": 0.004292805989583333, "learning_rate": 0.0001, "loss": 3.9393, "loss/crossentropy": 2.108347535133362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20547642558813095, "step": 28490 }, { "epoch": 0.56984, "grad_norm": 1.90625, "grad_norm_var": 0.003482818603515625, "learning_rate": 0.0001, "loss": 3.8703, "loss/crossentropy": 1.9661864638328552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1860649138689041, "step": 28492 }, { "epoch": 0.56988, "grad_norm": 1.9296875, "grad_norm_var": 0.0029111226399739583, "learning_rate": 0.0001, "loss": 3.8467, "loss/crossentropy": 1.7892447710037231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16265498101711273, "step": 28494 }, { "epoch": 0.56992, "grad_norm": 1.84375, "grad_norm_var": 0.0030507405598958334, "learning_rate": 0.0001, "loss": 4.1363, "loss/crossentropy": 2.2188740968704224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20954828709363937, "step": 28496 }, { "epoch": 0.56996, "grad_norm": 2.03125, "grad_norm_var": 0.005028279622395834, "learning_rate": 0.0001, "loss": 4.1339, "loss/crossentropy": 2.1535292863845825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19683580845594406, "step": 28498 }, { "epoch": 0.57, "grad_norm": 1.90625, "grad_norm_var": 0.0048258463541666664, "learning_rate": 0.0001, "loss": 3.8687, "loss/crossentropy": 2.170483350753784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20434408634901047, "step": 28500 }, { "epoch": 0.57004, "grad_norm": 1.9375, "grad_norm_var": 0.004735310872395833, "learning_rate": 0.0001, "loss": 3.9846, "loss/crossentropy": 1.9062520265579224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18132874369621277, "step": 28502 }, { "epoch": 0.57008, "grad_norm": 2.0, "grad_norm_var": 0.005014801025390625, "learning_rate": 0.0001, "loss": 3.9361, "loss/crossentropy": 1.9402993321418762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1883842498064041, "step": 28504 }, { "epoch": 0.57012, "grad_norm": 2.25, "grad_norm_var": 0.010516103108723958, "learning_rate": 0.0001, "loss": 4.2954, "loss/crossentropy": 2.1244900226593018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19686578959226608, "step": 28506 }, { "epoch": 0.57016, "grad_norm": 1.96875, "grad_norm_var": 0.010432688395182292, "learning_rate": 0.0001, "loss": 4.213, "loss/crossentropy": 2.255519151687622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19818376004695892, "step": 28508 }, { "epoch": 0.5702, "grad_norm": 1.90625, "grad_norm_var": 0.017838541666666666, "learning_rate": 0.0001, "loss": 4.3414, "loss/crossentropy": 1.69990873336792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16047322005033493, "step": 28510 }, { "epoch": 0.57024, "grad_norm": 1.859375, "grad_norm_var": 0.01759618123372396, "learning_rate": 0.0001, "loss": 4.0451, "loss/crossentropy": 2.0938282012939453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19069606065750122, "step": 28512 }, { "epoch": 0.57028, "grad_norm": 1.84375, "grad_norm_var": 0.015746053059895834, "learning_rate": 0.0001, "loss": 3.9285, "loss/crossentropy": 2.119936943054199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20535118132829666, "step": 28514 }, { "epoch": 0.57032, "grad_norm": 1.9453125, "grad_norm_var": 0.015364583333333333, "learning_rate": 0.0001, "loss": 4.1106, "loss/crossentropy": 2.124122440814972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19785825163125992, "step": 28516 }, { "epoch": 0.57036, "grad_norm": 1.765625, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 4.025, "loss/crossentropy": 2.275521993637085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197978176176548, "step": 28518 }, { "epoch": 0.5704, "grad_norm": 1.75, "grad_norm_var": 0.021089680989583335, "learning_rate": 0.0001, "loss": 3.8305, "loss/crossentropy": 1.9788197875022888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17468783259391785, "step": 28520 }, { "epoch": 0.57044, "grad_norm": 1.984375, "grad_norm_var": 0.015799713134765626, "learning_rate": 0.0001, "loss": 3.9497, "loss/crossentropy": 1.9069678783416748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17532987147569656, "step": 28522 }, { "epoch": 0.57048, "grad_norm": 1.9375, "grad_norm_var": 0.015705362955729166, "learning_rate": 0.0001, "loss": 3.9084, "loss/crossentropy": 1.7438762784004211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15763580799102783, "step": 28524 }, { "epoch": 0.57052, "grad_norm": 2.0625, "grad_norm_var": 0.008280436197916666, "learning_rate": 0.0001, "loss": 4.0142, "loss/crossentropy": 2.0234435200691223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17508037388324738, "step": 28526 }, { "epoch": 0.57056, "grad_norm": 2.0, "grad_norm_var": 0.008571116129557292, "learning_rate": 0.0001, "loss": 4.1993, "loss/crossentropy": 2.2856411933898926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183924585580826, "step": 28528 }, { "epoch": 0.5706, "grad_norm": 2.0625, "grad_norm_var": 0.010358683268229167, "learning_rate": 0.0001, "loss": 3.9569, "loss/crossentropy": 2.201618194580078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216714471578598, "step": 28530 }, { "epoch": 0.57064, "grad_norm": 1.84375, "grad_norm_var": 0.011348215738932292, "learning_rate": 0.0001, "loss": 4.017, "loss/crossentropy": 1.9202881455421448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18147829920053482, "step": 28532 }, { "epoch": 0.57068, "grad_norm": 1.9765625, "grad_norm_var": 0.009733072916666667, "learning_rate": 0.0001, "loss": 3.7967, "loss/crossentropy": 1.5407178401947021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14478188008069992, "step": 28534 }, { "epoch": 0.57072, "grad_norm": 2.09375, "grad_norm_var": 0.010920206705729166, "learning_rate": 0.0001, "loss": 3.8766, "loss/crossentropy": 1.836295247077942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18802417069673538, "step": 28536 }, { "epoch": 0.57076, "grad_norm": 1.9609375, "grad_norm_var": 0.013553619384765625, "learning_rate": 0.0001, "loss": 4.1791, "loss/crossentropy": 2.043636739253998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19008664041757584, "step": 28538 }, { "epoch": 0.5708, "grad_norm": 2.078125, "grad_norm_var": 0.014745076497395834, "learning_rate": 0.0001, "loss": 4.1531, "loss/crossentropy": 1.8939302563667297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18854600936174393, "step": 28540 }, { "epoch": 0.57084, "grad_norm": 1.953125, "grad_norm_var": 0.012658437093098959, "learning_rate": 0.0001, "loss": 4.0664, "loss/crossentropy": 2.1033613085746765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17389168590307236, "step": 28542 }, { "epoch": 0.57088, "grad_norm": 2.0625, "grad_norm_var": 0.013230133056640624, "learning_rate": 0.0001, "loss": 4.1976, "loss/crossentropy": 2.245096266269684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21308264881372452, "step": 28544 }, { "epoch": 0.57092, "grad_norm": 1.796875, "grad_norm_var": 0.012910715738932292, "learning_rate": 0.0001, "loss": 3.8638, "loss/crossentropy": 1.6646681427955627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16192668676376343, "step": 28546 }, { "epoch": 0.57096, "grad_norm": 1.78125, "grad_norm_var": 0.011922200520833334, "learning_rate": 0.0001, "loss": 3.8962, "loss/crossentropy": 2.29884135723114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17613109946250916, "step": 28548 }, { "epoch": 0.571, "grad_norm": 1.890625, "grad_norm_var": 0.01628392537434896, "learning_rate": 0.0001, "loss": 4.0125, "loss/crossentropy": 2.10237193107605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1935042291879654, "step": 28550 }, { "epoch": 0.57104, "grad_norm": 1.9921875, "grad_norm_var": 0.015048980712890625, "learning_rate": 0.0001, "loss": 4.0384, "loss/crossentropy": 2.1097005009651184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1907399743795395, "step": 28552 }, { "epoch": 0.57108, "grad_norm": 1.9140625, "grad_norm_var": 0.012824503580729167, "learning_rate": 0.0001, "loss": 4.155, "loss/crossentropy": 2.1058011054992676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19711777567863464, "step": 28554 }, { "epoch": 0.57112, "grad_norm": 1.9375, "grad_norm_var": 0.011659495035807292, "learning_rate": 0.0001, "loss": 3.9603, "loss/crossentropy": 2.0466136932373047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18761292099952698, "step": 28556 }, { "epoch": 0.57116, "grad_norm": 1.96875, "grad_norm_var": 0.011759440104166666, "learning_rate": 0.0001, "loss": 3.6357, "loss/crossentropy": 1.6006923913955688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17703057825565338, "step": 28558 }, { "epoch": 0.5712, "grad_norm": 2.015625, "grad_norm_var": 0.011183420817057291, "learning_rate": 0.0001, "loss": 3.8007, "loss/crossentropy": 1.884728193283081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18712394684553146, "step": 28560 }, { "epoch": 0.57124, "grad_norm": 1.953125, "grad_norm_var": 0.009718577067057291, "learning_rate": 0.0001, "loss": 3.8565, "loss/crossentropy": 1.8759589791297913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19130942225456238, "step": 28562 }, { "epoch": 0.57128, "grad_norm": 1.8515625, "grad_norm_var": 0.009785715738932292, "learning_rate": 0.0001, "loss": 4.1392, "loss/crossentropy": 1.9973544478416443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18966401368379593, "step": 28564 }, { "epoch": 0.57132, "grad_norm": 1.75, "grad_norm_var": 0.007368723551432292, "learning_rate": 0.0001, "loss": 3.9515, "loss/crossentropy": 2.1246695518493652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938970983028412, "step": 28566 }, { "epoch": 0.57136, "grad_norm": 1.9375, "grad_norm_var": 0.0065093994140625, "learning_rate": 0.0001, "loss": 3.7797, "loss/crossentropy": 1.5528571605682373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15729191899299622, "step": 28568 }, { "epoch": 0.5714, "grad_norm": 1.984375, "grad_norm_var": 0.0068023681640625, "learning_rate": 0.0001, "loss": 3.995, "loss/crossentropy": 2.0078552961349487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19063813984394073, "step": 28570 }, { "epoch": 0.57144, "grad_norm": 1.890625, "grad_norm_var": 0.0072021484375, "learning_rate": 0.0001, "loss": 4.0105, "loss/crossentropy": 1.8582743406295776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16606975346803665, "step": 28572 }, { "epoch": 0.57148, "grad_norm": 1.8125, "grad_norm_var": 0.008784739176432292, "learning_rate": 0.0001, "loss": 3.7379, "loss/crossentropy": 2.1142138242721558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19289004802703857, "step": 28574 }, { "epoch": 0.57152, "grad_norm": 1.859375, "grad_norm_var": 0.008147939046223959, "learning_rate": 0.0001, "loss": 3.9771, "loss/crossentropy": 2.0852617621421814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18875133991241455, "step": 28576 }, { "epoch": 0.57156, "grad_norm": 1.9765625, "grad_norm_var": 0.007541656494140625, "learning_rate": 0.0001, "loss": 4.0243, "loss/crossentropy": 2.1352654695510864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19411734491586685, "step": 28578 }, { "epoch": 0.5716, "grad_norm": 1.8671875, "grad_norm_var": 0.0069048563639322914, "learning_rate": 0.0001, "loss": 3.5985, "loss/crossentropy": 1.7001954317092896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17129848897457123, "step": 28580 }, { "epoch": 0.57164, "grad_norm": 1.8515625, "grad_norm_var": 0.004894765218098959, "learning_rate": 0.0001, "loss": 3.7744, "loss/crossentropy": 1.8788430094718933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17039231956005096, "step": 28582 }, { "epoch": 0.57168, "grad_norm": 1.9765625, "grad_norm_var": 0.005307769775390625, "learning_rate": 0.0001, "loss": 4.0533, "loss/crossentropy": 1.9935740232467651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18447662144899368, "step": 28584 }, { "epoch": 0.57172, "grad_norm": 1.890625, "grad_norm_var": 0.0067779541015625, "learning_rate": 0.0001, "loss": 3.652, "loss/crossentropy": 1.6966716051101685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17215370386838913, "step": 28586 }, { "epoch": 0.57176, "grad_norm": 1.8359375, "grad_norm_var": 0.006841786702473958, "learning_rate": 0.0001, "loss": 3.6792, "loss/crossentropy": 1.8064610958099365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18994714319705963, "step": 28588 }, { "epoch": 0.5718, "grad_norm": 1.7890625, "grad_norm_var": 0.006956990559895833, "learning_rate": 0.0001, "loss": 3.8408, "loss/crossentropy": 1.8688351511955261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17651986330747604, "step": 28590 }, { "epoch": 0.57184, "grad_norm": 2.09375, "grad_norm_var": 0.023482259114583334, "learning_rate": 0.0001, "loss": 3.9822, "loss/crossentropy": 1.9788597226142883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21657729148864746, "step": 28592 }, { "epoch": 0.57188, "grad_norm": 2.125, "grad_norm_var": 0.02662938435872396, "learning_rate": 0.0001, "loss": 4.2794, "loss/crossentropy": 2.2449204325675964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20486489683389664, "step": 28594 }, { "epoch": 0.57192, "grad_norm": 1.875, "grad_norm_var": 0.02296727498372396, "learning_rate": 0.0001, "loss": 3.9333, "loss/crossentropy": 1.9045695662498474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19815212488174438, "step": 28596 }, { "epoch": 0.57196, "grad_norm": 1.8125, "grad_norm_var": 0.025742340087890624, "learning_rate": 0.0001, "loss": 3.7332, "loss/crossentropy": 1.6905238628387451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1561586558818817, "step": 28598 }, { "epoch": 0.572, "grad_norm": 2.1875, "grad_norm_var": 0.03126220703125, "learning_rate": 0.0001, "loss": 3.7146, "loss/crossentropy": 1.788037657737732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17239058762788773, "step": 28600 }, { "epoch": 0.57204, "grad_norm": 2.046875, "grad_norm_var": 0.02777684529622396, "learning_rate": 0.0001, "loss": 4.0529, "loss/crossentropy": 1.9274045825004578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20560631155967712, "step": 28602 }, { "epoch": 0.57208, "grad_norm": 1.7734375, "grad_norm_var": 0.028820546468098958, "learning_rate": 0.0001, "loss": 3.8056, "loss/crossentropy": 1.909210741519928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17594803869724274, "step": 28604 }, { "epoch": 0.57212, "grad_norm": 2.125, "grad_norm_var": 0.028362782796223958, "learning_rate": 0.0001, "loss": 3.9912, "loss/crossentropy": 2.0221698880195618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869770884513855, "step": 28606 }, { "epoch": 0.57216, "grad_norm": 1.9375, "grad_norm_var": 0.01813329060872396, "learning_rate": 0.0001, "loss": 4.184, "loss/crossentropy": 2.235592007637024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17893031239509583, "step": 28608 }, { "epoch": 0.5722, "grad_norm": 1.828125, "grad_norm_var": 0.028303019205729165, "learning_rate": 0.0001, "loss": 4.0714, "loss/crossentropy": 2.098126530647278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068413719534874, "step": 28610 }, { "epoch": 0.57224, "grad_norm": 1.75, "grad_norm_var": 0.03432184855143229, "learning_rate": 0.0001, "loss": 3.7392, "loss/crossentropy": 1.97664475440979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18118216842412949, "step": 28612 }, { "epoch": 0.57228, "grad_norm": 2.078125, "grad_norm_var": 0.03205540974934896, "learning_rate": 0.0001, "loss": 3.9675, "loss/crossentropy": 1.9696056246757507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18983419984579086, "step": 28614 }, { "epoch": 0.57232, "grad_norm": 1.96875, "grad_norm_var": 0.0686767578125, "learning_rate": 0.0001, "loss": 4.0106, "loss/crossentropy": 2.1281611919403076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22855115681886673, "step": 28616 }, { "epoch": 0.57236, "grad_norm": 1.796875, "grad_norm_var": 0.07779032389322917, "learning_rate": 0.0001, "loss": 3.987, "loss/crossentropy": 1.7333523631095886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1650746390223503, "step": 28618 }, { "epoch": 0.5724, "grad_norm": 2.71875, "grad_norm_var": 0.10811258951822916, "learning_rate": 0.0001, "loss": 4.2372, "loss/crossentropy": 2.021788716316223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19636931270360947, "step": 28620 }, { "epoch": 0.57244, "grad_norm": 1.9921875, "grad_norm_var": 0.1073394775390625, "learning_rate": 0.0001, "loss": 3.8144, "loss/crossentropy": 1.7764569520950317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18557444959878922, "step": 28622 }, { "epoch": 0.57248, "grad_norm": 1.8515625, "grad_norm_var": 0.10955301920572917, "learning_rate": 0.0001, "loss": 3.8214, "loss/crossentropy": 1.8160730004310608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1741533875465393, "step": 28624 }, { "epoch": 0.57252, "grad_norm": 1.84375, "grad_norm_var": 0.1038225809733073, "learning_rate": 0.0001, "loss": 3.7493, "loss/crossentropy": 1.7295884490013123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17627893388271332, "step": 28626 }, { "epoch": 0.57256, "grad_norm": 1.9140625, "grad_norm_var": 0.08949381510416667, "learning_rate": 0.0001, "loss": 3.9968, "loss/crossentropy": 1.9967339038848877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878109648823738, "step": 28628 }, { "epoch": 0.5726, "grad_norm": 1.8984375, "grad_norm_var": 0.09168294270833334, "learning_rate": 0.0001, "loss": 3.9292, "loss/crossentropy": 2.4338849782943726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211342379450798, "step": 28630 }, { "epoch": 0.57264, "grad_norm": 1.921875, "grad_norm_var": 0.057566070556640626, "learning_rate": 0.0001, "loss": 3.928, "loss/crossentropy": 2.0511388778686523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2051515057682991, "step": 28632 }, { "epoch": 0.57268, "grad_norm": 1.84375, "grad_norm_var": 0.05156021118164063, "learning_rate": 0.0001, "loss": 3.5661, "loss/crossentropy": 1.7425475716590881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18100523948669434, "step": 28634 }, { "epoch": 0.57272, "grad_norm": 2.03125, "grad_norm_var": 0.004343414306640625, "learning_rate": 0.0001, "loss": 4.2029, "loss/crossentropy": 2.0718079805374146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951262131333351, "step": 28636 }, { "epoch": 0.57276, "grad_norm": 2.015625, "grad_norm_var": 0.004514312744140625, "learning_rate": 0.0001, "loss": 4.0589, "loss/crossentropy": 1.9455790519714355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2024966925382614, "step": 28638 }, { "epoch": 0.5728, "grad_norm": 1.8515625, "grad_norm_var": 0.006075032552083333, "learning_rate": 0.0001, "loss": 3.8238, "loss/crossentropy": 1.9923955798149109, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17989075928926468, "step": 28640 }, { "epoch": 0.57284, "grad_norm": 1.9296875, "grad_norm_var": 0.005708567301432292, "learning_rate": 0.0001, "loss": 4.0637, "loss/crossentropy": 2.178214907646179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973334327340126, "step": 28642 }, { "epoch": 0.57288, "grad_norm": 1.9375, "grad_norm_var": 0.004349517822265625, "learning_rate": 0.0001, "loss": 3.9935, "loss/crossentropy": 1.881864607334137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17736243456602097, "step": 28644 }, { "epoch": 0.57292, "grad_norm": 1.859375, "grad_norm_var": 0.004988352457682292, "learning_rate": 0.0001, "loss": 4.2611, "loss/crossentropy": 2.140734553337097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2056664153933525, "step": 28646 }, { "epoch": 0.57296, "grad_norm": 1.921875, "grad_norm_var": 0.0049479166666666664, "learning_rate": 0.0001, "loss": 4.1077, "loss/crossentropy": 2.2030951976776123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997833326458931, "step": 28648 }, { "epoch": 0.573, "grad_norm": 1.984375, "grad_norm_var": 0.0045888264973958336, "learning_rate": 0.0001, "loss": 4.1237, "loss/crossentropy": 1.8152233958244324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17497697472572327, "step": 28650 }, { "epoch": 0.57304, "grad_norm": 1.9609375, "grad_norm_var": 0.008056386311848959, "learning_rate": 0.0001, "loss": 4.4633, "loss/crossentropy": 2.2969506978988647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22079680860042572, "step": 28652 }, { "epoch": 0.57308, "grad_norm": 1.875, "grad_norm_var": 0.010550689697265626, "learning_rate": 0.0001, "loss": 4.0007, "loss/crossentropy": 2.058579981327057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992625817656517, "step": 28654 }, { "epoch": 0.57312, "grad_norm": 1.9453125, "grad_norm_var": 0.0073964436848958336, "learning_rate": 0.0001, "loss": 3.9863, "loss/crossentropy": 1.9630435109138489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19192881137132645, "step": 28656 }, { "epoch": 0.57316, "grad_norm": 1.953125, "grad_norm_var": 0.008408355712890624, "learning_rate": 0.0001, "loss": 3.9874, "loss/crossentropy": 2.089448630809784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.185981385409832, "step": 28658 }, { "epoch": 0.5732, "grad_norm": 1.890625, "grad_norm_var": 0.0088287353515625, "learning_rate": 0.0001, "loss": 3.8705, "loss/crossentropy": 2.1264119148254395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18827120959758759, "step": 28660 }, { "epoch": 0.57324, "grad_norm": 1.7734375, "grad_norm_var": 0.0121978759765625, "learning_rate": 0.0001, "loss": 3.8047, "loss/crossentropy": 2.121288537979126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772882044315338, "step": 28662 }, { "epoch": 0.57328, "grad_norm": 2.484375, "grad_norm_var": 0.03190689086914063, "learning_rate": 0.0001, "loss": 4.1513, "loss/crossentropy": 2.0339037775993347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1962498426437378, "step": 28664 }, { "epoch": 0.57332, "grad_norm": 1.828125, "grad_norm_var": 0.03292210896809896, "learning_rate": 0.0001, "loss": 4.0585, "loss/crossentropy": 2.1422963738441467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18338894098997116, "step": 28666 }, { "epoch": 0.57336, "grad_norm": 2.09375, "grad_norm_var": 0.030590565999348958, "learning_rate": 0.0001, "loss": 4.2466, "loss/crossentropy": 2.029167354106903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20171435922384262, "step": 28668 }, { "epoch": 0.5734, "grad_norm": 2.203125, "grad_norm_var": 0.032225545247395834, "learning_rate": 0.0001, "loss": 3.918, "loss/crossentropy": 1.7628564238548279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19051892310380936, "step": 28670 }, { "epoch": 0.57344, "grad_norm": 1.734375, "grad_norm_var": 0.03588053385416667, "learning_rate": 0.0001, "loss": 3.7368, "loss/crossentropy": 2.0292606949806213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17968103289604187, "step": 28672 }, { "epoch": 0.57348, "grad_norm": 2.125, "grad_norm_var": 0.044130198160807294, "learning_rate": 0.0001, "loss": 4.2414, "loss/crossentropy": 1.9167731404304504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22261708974838257, "step": 28674 }, { "epoch": 0.57352, "grad_norm": 2.03125, "grad_norm_var": 0.04302164713541667, "learning_rate": 0.0001, "loss": 3.9668, "loss/crossentropy": 1.9671185612678528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19196172803640366, "step": 28676 }, { "epoch": 0.57356, "grad_norm": 1.8671875, "grad_norm_var": 0.037666575113932295, "learning_rate": 0.0001, "loss": 3.8129, "loss/crossentropy": 2.0191508531570435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18491507321596146, "step": 28678 }, { "epoch": 0.5736, "grad_norm": 1.859375, "grad_norm_var": 0.023209635416666666, "learning_rate": 0.0001, "loss": 4.0662, "loss/crossentropy": 1.9060558676719666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18243618309497833, "step": 28680 }, { "epoch": 0.57364, "grad_norm": 2.03125, "grad_norm_var": 0.0218658447265625, "learning_rate": 0.0001, "loss": 4.0068, "loss/crossentropy": 1.867979109287262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23452743887901306, "step": 28682 }, { "epoch": 0.57368, "grad_norm": 1.953125, "grad_norm_var": 0.021354166666666667, "learning_rate": 0.0001, "loss": 4.079, "loss/crossentropy": 1.6199312806129456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16022612154483795, "step": 28684 }, { "epoch": 0.57372, "grad_norm": 1.9609375, "grad_norm_var": 0.0176422119140625, "learning_rate": 0.0001, "loss": 4.191, "loss/crossentropy": 2.5148130655288696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2360077053308487, "step": 28686 }, { "epoch": 0.57376, "grad_norm": 2.015625, "grad_norm_var": 0.015126291910807292, "learning_rate": 0.0001, "loss": 4.032, "loss/crossentropy": 1.9117471575737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010490596294403, "step": 28688 }, { "epoch": 0.5738, "grad_norm": 1.796875, "grad_norm_var": 0.006821441650390625, "learning_rate": 0.0001, "loss": 4.0699, "loss/crossentropy": 2.079534113407135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18718824535608292, "step": 28690 }, { "epoch": 0.57384, "grad_norm": 1.9765625, "grad_norm_var": 0.00633544921875, "learning_rate": 0.0001, "loss": 3.97, "loss/crossentropy": 2.187251031398773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929212063550949, "step": 28692 }, { "epoch": 0.57388, "grad_norm": 1.90625, "grad_norm_var": 0.006023915608723959, "learning_rate": 0.0001, "loss": 4.0509, "loss/crossentropy": 1.9140017628669739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726428121328354, "step": 28694 }, { "epoch": 0.57392, "grad_norm": 1.9140625, "grad_norm_var": 0.0047515869140625, "learning_rate": 0.0001, "loss": 3.9645, "loss/crossentropy": 1.530074417591095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15649892389774323, "step": 28696 }, { "epoch": 0.57396, "grad_norm": 2.171875, "grad_norm_var": 0.006998443603515625, "learning_rate": 0.0001, "loss": 4.1736, "loss/crossentropy": 2.2588948011398315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20315733551979065, "step": 28698 }, { "epoch": 0.574, "grad_norm": 2.140625, "grad_norm_var": 0.0094390869140625, "learning_rate": 0.0001, "loss": 4.0446, "loss/crossentropy": 2.1702714562416077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24278688430786133, "step": 28700 }, { "epoch": 0.57404, "grad_norm": 2.046875, "grad_norm_var": 0.009789021809895833, "learning_rate": 0.0001, "loss": 3.9904, "loss/crossentropy": 2.1350772380828857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21515338122844696, "step": 28702 }, { "epoch": 0.57408, "grad_norm": 1.953125, "grad_norm_var": 0.0086822509765625, "learning_rate": 0.0001, "loss": 3.9282, "loss/crossentropy": 1.9809751510620117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18557599931955338, "step": 28704 }, { "epoch": 0.57412, "grad_norm": 1.9921875, "grad_norm_var": 0.006658681233723958, "learning_rate": 0.0001, "loss": 3.9628, "loss/crossentropy": 1.7658061981201172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16570701450109482, "step": 28706 }, { "epoch": 0.57416, "grad_norm": 1.7421875, "grad_norm_var": 0.010188547770182292, "learning_rate": 0.0001, "loss": 3.8666, "loss/crossentropy": 1.787250578403473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18032344430685043, "step": 28708 }, { "epoch": 0.5742, "grad_norm": 1.84375, "grad_norm_var": 0.011082967122395834, "learning_rate": 0.0001, "loss": 4.1878, "loss/crossentropy": 2.361757516860962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2135041281580925, "step": 28710 }, { "epoch": 0.57424, "grad_norm": 1.984375, "grad_norm_var": 0.010921223958333334, "learning_rate": 0.0001, "loss": 4.1266, "loss/crossentropy": 2.0687750577926636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20218269526958466, "step": 28712 }, { "epoch": 0.57428, "grad_norm": 2.0625, "grad_norm_var": 0.010587310791015625, "learning_rate": 0.0001, "loss": 3.8437, "loss/crossentropy": 1.816366970539093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1572190448641777, "step": 28714 }, { "epoch": 0.57432, "grad_norm": 1.7265625, "grad_norm_var": 0.010138956705729167, "learning_rate": 0.0001, "loss": 3.8273, "loss/crossentropy": 2.2949939966201782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18894164264202118, "step": 28716 }, { "epoch": 0.57436, "grad_norm": 1.9296875, "grad_norm_var": 0.008680979410807291, "learning_rate": 0.0001, "loss": 3.9153, "loss/crossentropy": 2.249353289604187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117348968982697, "step": 28718 }, { "epoch": 0.5744, "grad_norm": 1.9765625, "grad_norm_var": 0.008902740478515626, "learning_rate": 0.0001, "loss": 4.2787, "loss/crossentropy": 2.3134714365005493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21170897036790848, "step": 28720 }, { "epoch": 0.57444, "grad_norm": 2.046875, "grad_norm_var": 0.009748331705729167, "learning_rate": 0.0001, "loss": 3.8791, "loss/crossentropy": 2.241877317428589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200472578406334, "step": 28722 }, { "epoch": 0.57448, "grad_norm": 1.96875, "grad_norm_var": 0.007844034830729167, "learning_rate": 0.0001, "loss": 4.2115, "loss/crossentropy": 2.39408540725708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025068998336792, "step": 28724 }, { "epoch": 0.57452, "grad_norm": 1.828125, "grad_norm_var": 0.008063761393229167, "learning_rate": 0.0001, "loss": 3.9963, "loss/crossentropy": 1.9735210537910461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17944949120283127, "step": 28726 }, { "epoch": 0.57456, "grad_norm": 1.8671875, "grad_norm_var": 0.009040323893229167, "learning_rate": 0.0001, "loss": 3.9401, "loss/crossentropy": 2.028856635093689, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19235961139202118, "step": 28728 }, { "epoch": 0.5746, "grad_norm": 1.9609375, "grad_norm_var": 0.006992340087890625, "learning_rate": 0.0001, "loss": 4.3698, "loss/crossentropy": 2.18750536441803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18273723125457764, "step": 28730 }, { "epoch": 0.57464, "grad_norm": 1.9765625, "grad_norm_var": 0.004378255208333333, "learning_rate": 0.0001, "loss": 4.0532, "loss/crossentropy": 2.127722382545471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2066717892885208, "step": 28732 }, { "epoch": 0.57468, "grad_norm": 1.8828125, "grad_norm_var": 0.004255930582682292, "learning_rate": 0.0001, "loss": 3.8228, "loss/crossentropy": 2.1561193466186523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19151438027620316, "step": 28734 }, { "epoch": 0.57472, "grad_norm": 1.890625, "grad_norm_var": 0.004463704427083334, "learning_rate": 0.0001, "loss": 3.9893, "loss/crossentropy": 2.3356776237487793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20846813172101974, "step": 28736 }, { "epoch": 0.57476, "grad_norm": 1.828125, "grad_norm_var": 0.005182902018229167, "learning_rate": 0.0001, "loss": 4.14, "loss/crossentropy": 2.024761378765106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19134287536144257, "step": 28738 }, { "epoch": 0.5748, "grad_norm": 1.890625, "grad_norm_var": 0.006060536702473958, "learning_rate": 0.0001, "loss": 3.8726, "loss/crossentropy": 2.242386519908905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20453084260225296, "step": 28740 }, { "epoch": 0.57484, "grad_norm": 1.90625, "grad_norm_var": 0.005472819010416667, "learning_rate": 0.0001, "loss": 3.9335, "loss/crossentropy": 1.7191591262817383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17990755289793015, "step": 28742 }, { "epoch": 0.57488, "grad_norm": 1.9453125, "grad_norm_var": 0.004571278889973958, "learning_rate": 0.0001, "loss": 3.7335, "loss/crossentropy": 2.1018252968788147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20789441466331482, "step": 28744 }, { "epoch": 0.57492, "grad_norm": 1.9765625, "grad_norm_var": 0.0042803446451822914, "learning_rate": 0.0001, "loss": 3.8906, "loss/crossentropy": 1.9056601524353027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18688670545816422, "step": 28746 }, { "epoch": 0.57496, "grad_norm": 1.8984375, "grad_norm_var": 0.00360107421875, "learning_rate": 0.0001, "loss": 3.8521, "loss/crossentropy": 2.0531609058380127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931629702448845, "step": 28748 }, { "epoch": 0.575, "grad_norm": 1.9140625, "grad_norm_var": 0.003684234619140625, "learning_rate": 0.0001, "loss": 3.9218, "loss/crossentropy": 1.89992356300354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1808946281671524, "step": 28750 }, { "epoch": 0.57504, "grad_norm": 1.9375, "grad_norm_var": 0.003739166259765625, "learning_rate": 0.0001, "loss": 3.8245, "loss/crossentropy": 2.2009544372558594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20267173647880554, "step": 28752 }, { "epoch": 0.57508, "grad_norm": 1.9375, "grad_norm_var": 0.0043108622233072914, "learning_rate": 0.0001, "loss": 4.2622, "loss/crossentropy": 2.009132504463196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20399124920368195, "step": 28754 }, { "epoch": 0.57512, "grad_norm": 1.8046875, "grad_norm_var": 0.004365793863932292, "learning_rate": 0.0001, "loss": 3.7545, "loss/crossentropy": 1.9875890612602234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20523500442504883, "step": 28756 }, { "epoch": 0.57516, "grad_norm": 5.40625, "grad_norm_var": 0.7703570048014323, "learning_rate": 0.0001, "loss": 4.0484, "loss/crossentropy": 1.8446847200393677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16559410095214844, "step": 28758 }, { "epoch": 0.5752, "grad_norm": 2.140625, "grad_norm_var": 0.7632118225097656, "learning_rate": 0.0001, "loss": 4.1006, "loss/crossentropy": 2.147071599960327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19807270914316177, "step": 28760 }, { "epoch": 0.57524, "grad_norm": 1.96875, "grad_norm_var": 0.760791015625, "learning_rate": 0.0001, "loss": 4.1881, "loss/crossentropy": 2.046617865562439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18792100995779037, "step": 28762 }, { "epoch": 0.57528, "grad_norm": 2.0, "grad_norm_var": 0.7528297424316406, "learning_rate": 0.0001, "loss": 4.0231, "loss/crossentropy": 1.6951394081115723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1635662019252777, "step": 28764 }, { "epoch": 0.57532, "grad_norm": 2.046875, "grad_norm_var": 0.7440752665201823, "learning_rate": 0.0001, "loss": 3.8999, "loss/crossentropy": 1.7923210263252258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16349300742149353, "step": 28766 }, { "epoch": 0.57536, "grad_norm": 1.953125, "grad_norm_var": 0.7344011942545573, "learning_rate": 0.0001, "loss": 4.2475, "loss/crossentropy": 2.0918190479278564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20321150124073029, "step": 28768 }, { "epoch": 0.5754, "grad_norm": 1.9375, "grad_norm_var": 0.7387451171875, "learning_rate": 0.0001, "loss": 4.1483, "loss/crossentropy": 1.691568374633789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18507587909698486, "step": 28770 }, { "epoch": 0.57544, "grad_norm": 1.921875, "grad_norm_var": 0.7320757548014323, "learning_rate": 0.0001, "loss": 4.0857, "loss/crossentropy": 2.019860029220581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19651245325803757, "step": 28772 }, { "epoch": 0.57548, "grad_norm": 2.015625, "grad_norm_var": 0.01715672810872396, "learning_rate": 0.0001, "loss": 4.0712, "loss/crossentropy": 1.9527531862258911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19377586245536804, "step": 28774 }, { "epoch": 0.57552, "grad_norm": 2.0625, "grad_norm_var": 0.019486236572265624, "learning_rate": 0.0001, "loss": 4.0374, "loss/crossentropy": 2.105517864227295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1907084435224533, "step": 28776 }, { "epoch": 0.57556, "grad_norm": 1.7890625, "grad_norm_var": 0.014277903238932292, "learning_rate": 0.0001, "loss": 3.88, "loss/crossentropy": 1.804184377193451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17274542152881622, "step": 28778 }, { "epoch": 0.5756, "grad_norm": 1.96875, "grad_norm_var": 0.014102935791015625, "learning_rate": 0.0001, "loss": 4.0615, "loss/crossentropy": 2.3283534049987793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21980226039886475, "step": 28780 }, { "epoch": 0.57564, "grad_norm": 1.9375, "grad_norm_var": 0.010727691650390624, "learning_rate": 0.0001, "loss": 3.8824, "loss/crossentropy": 2.054227828979492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968243345618248, "step": 28782 }, { "epoch": 0.57568, "grad_norm": 1.953125, "grad_norm_var": 0.007614898681640625, "learning_rate": 0.0001, "loss": 3.9686, "loss/crossentropy": 2.2226120233535767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20916803926229477, "step": 28784 }, { "epoch": 0.57572, "grad_norm": 1.9296875, "grad_norm_var": 0.007905832926432292, "learning_rate": 0.0001, "loss": 4.007, "loss/crossentropy": 1.9371869564056396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21197544038295746, "step": 28786 }, { "epoch": 0.57576, "grad_norm": 2.21875, "grad_norm_var": 0.017122395833333335, "learning_rate": 0.0001, "loss": 3.6572, "loss/crossentropy": 1.6595491766929626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1565944254398346, "step": 28788 }, { "epoch": 0.5758, "grad_norm": 1.90625, "grad_norm_var": 0.015276845296223958, "learning_rate": 0.0001, "loss": 3.9956, "loss/crossentropy": 1.7383070588111877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1768334060907364, "step": 28790 }, { "epoch": 0.57584, "grad_norm": 1.859375, "grad_norm_var": 0.015508778889973958, "learning_rate": 0.0001, "loss": 4.0105, "loss/crossentropy": 1.9087260365486145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958516761660576, "step": 28792 }, { "epoch": 0.57588, "grad_norm": 2.03125, "grad_norm_var": 0.014460245768229166, "learning_rate": 0.0001, "loss": 3.9552, "loss/crossentropy": 2.1638959646224976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20037583261728287, "step": 28794 }, { "epoch": 0.57592, "grad_norm": 2.09375, "grad_norm_var": 0.015132649739583334, "learning_rate": 0.0001, "loss": 4.1782, "loss/crossentropy": 2.067569851875305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20235508680343628, "step": 28796 }, { "epoch": 0.57596, "grad_norm": 1.9296875, "grad_norm_var": 0.014891560872395833, "learning_rate": 0.0001, "loss": 4.0691, "loss/crossentropy": 2.377517819404602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21966589242219925, "step": 28798 }, { "epoch": 0.576, "grad_norm": 1.8046875, "grad_norm_var": 0.016556549072265624, "learning_rate": 0.0001, "loss": 3.8553, "loss/crossentropy": 1.8835238814353943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17692726850509644, "step": 28800 }, { "epoch": 0.57604, "grad_norm": 1.84375, "grad_norm_var": 0.017093912760416666, "learning_rate": 0.0001, "loss": 4.0865, "loss/crossentropy": 2.177576720714569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1855052188038826, "step": 28802 }, { "epoch": 0.57608, "grad_norm": 2.03125, "grad_norm_var": 0.007881673177083333, "learning_rate": 0.0001, "loss": 4.09, "loss/crossentropy": 2.2061915397644043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123064547777176, "step": 28804 }, { "epoch": 0.57612, "grad_norm": 1.7421875, "grad_norm_var": 0.010692342122395834, "learning_rate": 0.0001, "loss": 3.9471, "loss/crossentropy": 1.9637818932533264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20487510412931442, "step": 28806 }, { "epoch": 0.57616, "grad_norm": 2.140625, "grad_norm_var": 0.014399973551432292, "learning_rate": 0.0001, "loss": 4.0394, "loss/crossentropy": 2.1481754183769226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974523589015007, "step": 28808 }, { "epoch": 0.5762, "grad_norm": 1.9453125, "grad_norm_var": 0.0139312744140625, "learning_rate": 0.0001, "loss": 3.9125, "loss/crossentropy": 1.907672941684723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18901971727609634, "step": 28810 }, { "epoch": 0.57624, "grad_norm": 2.0, "grad_norm_var": 0.0115875244140625, "learning_rate": 0.0001, "loss": 4.0425, "loss/crossentropy": 2.2241225838661194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882694587111473, "step": 28812 }, { "epoch": 0.57628, "grad_norm": 1.828125, "grad_norm_var": 0.01248779296875, "learning_rate": 0.0001, "loss": 4.1874, "loss/crossentropy": 2.150957465171814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233213186264038, "step": 28814 }, { "epoch": 0.57632, "grad_norm": 1.953125, "grad_norm_var": 0.011494954427083334, "learning_rate": 0.0001, "loss": 4.0973, "loss/crossentropy": 1.9427680373191833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19366063922643661, "step": 28816 }, { "epoch": 0.57636, "grad_norm": 1.9140625, "grad_norm_var": 0.012027740478515625, "learning_rate": 0.0001, "loss": 3.7749, "loss/crossentropy": 1.7898414731025696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18277223408222198, "step": 28818 }, { "epoch": 0.5764, "grad_norm": 2.21875, "grad_norm_var": 0.03846435546875, "learning_rate": 0.0001, "loss": 4.0501, "loss/crossentropy": 2.0117841362953186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17678485065698624, "step": 28820 }, { "epoch": 0.57644, "grad_norm": 1.796875, "grad_norm_var": 0.03804423014322917, "learning_rate": 0.0001, "loss": 3.749, "loss/crossentropy": 2.0270848274230957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17247303575277328, "step": 28822 }, { "epoch": 0.57648, "grad_norm": 1.9296875, "grad_norm_var": 0.0336822509765625, "learning_rate": 0.0001, "loss": 3.9248, "loss/crossentropy": 1.917984962463379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1722751259803772, "step": 28824 }, { "epoch": 0.57652, "grad_norm": 2.03125, "grad_norm_var": 0.03392512003580729, "learning_rate": 0.0001, "loss": 4.1579, "loss/crossentropy": 1.8189431428909302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18306910246610641, "step": 28826 }, { "epoch": 0.57656, "grad_norm": 1.8125, "grad_norm_var": 0.03524144490559896, "learning_rate": 0.0001, "loss": 4.0638, "loss/crossentropy": 1.8207548260688782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1782800853252411, "step": 28828 }, { "epoch": 0.5766, "grad_norm": 1.703125, "grad_norm_var": 0.038266754150390624, "learning_rate": 0.0001, "loss": 4.0089, "loss/crossentropy": 2.1244252920150757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18752482533454895, "step": 28830 }, { "epoch": 0.57664, "grad_norm": 1.890625, "grad_norm_var": 0.038426717122395836, "learning_rate": 0.0001, "loss": 4.1127, "loss/crossentropy": 2.435193181037903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23042550683021545, "step": 28832 }, { "epoch": 0.57668, "grad_norm": 1.828125, "grad_norm_var": 0.03916803995768229, "learning_rate": 0.0001, "loss": 3.6812, "loss/crossentropy": 2.1272794008255005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19795158505439758, "step": 28834 }, { "epoch": 0.57672, "grad_norm": 1.8671875, "grad_norm_var": 0.009814453125, "learning_rate": 0.0001, "loss": 4.08, "loss/crossentropy": 1.7166744470596313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17582352459430695, "step": 28836 }, { "epoch": 0.57676, "grad_norm": 1.7421875, "grad_norm_var": 0.011248524983723958, "learning_rate": 0.0001, "loss": 3.85, "loss/crossentropy": 2.066642463207245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19613336026668549, "step": 28838 }, { "epoch": 0.5768, "grad_norm": 1.7890625, "grad_norm_var": 0.0114166259765625, "learning_rate": 0.0001, "loss": 3.8214, "loss/crossentropy": 1.9082713723182678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.173607736825943, "step": 28840 }, { "epoch": 0.57684, "grad_norm": 2.046875, "grad_norm_var": 0.0119781494140625, "learning_rate": 0.0001, "loss": 3.8438, "loss/crossentropy": 2.0809885263442993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1808517575263977, "step": 28842 }, { "epoch": 0.57688, "grad_norm": 1.921875, "grad_norm_var": 0.011246490478515624, "learning_rate": 0.0001, "loss": 3.7981, "loss/crossentropy": 1.8057249784469604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17345206439495087, "step": 28844 }, { "epoch": 0.57692, "grad_norm": 1.953125, "grad_norm_var": 0.012627919514973959, "learning_rate": 0.0001, "loss": 4.2121, "loss/crossentropy": 2.3364038467407227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2242283821105957, "step": 28846 }, { "epoch": 0.57696, "grad_norm": 2.078125, "grad_norm_var": 0.014387003580729167, "learning_rate": 0.0001, "loss": 4.2341, "loss/crossentropy": 2.256643772125244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20197833329439163, "step": 28848 }, { "epoch": 0.577, "grad_norm": 1.9140625, "grad_norm_var": 0.013270823160807292, "learning_rate": 0.0001, "loss": 4.0897, "loss/crossentropy": 2.2531803846359253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19735650718212128, "step": 28850 }, { "epoch": 0.57704, "grad_norm": 1.953125, "grad_norm_var": 0.011897786458333334, "learning_rate": 0.0001, "loss": 4.1416, "loss/crossentropy": 2.1737340688705444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048204094171524, "step": 28852 }, { "epoch": 0.57708, "grad_norm": 2.03125, "grad_norm_var": 0.0110595703125, "learning_rate": 0.0001, "loss": 4.2294, "loss/crossentropy": 2.2098671197891235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21505644917488098, "step": 28854 }, { "epoch": 0.57712, "grad_norm": 1.875, "grad_norm_var": 0.011149088541666666, "learning_rate": 0.0001, "loss": 3.7808, "loss/crossentropy": 2.0007169246673584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17798644304275513, "step": 28856 }, { "epoch": 0.57716, "grad_norm": 1.8515625, "grad_norm_var": 0.0106597900390625, "learning_rate": 0.0001, "loss": 3.6825, "loss/crossentropy": 1.9002285599708557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18565426766872406, "step": 28858 }, { "epoch": 0.5772, "grad_norm": 1.9375, "grad_norm_var": 0.008833567301432291, "learning_rate": 0.0001, "loss": 3.8333, "loss/crossentropy": 1.9257296323776245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18187644332647324, "step": 28860 }, { "epoch": 0.57724, "grad_norm": 1.8515625, "grad_norm_var": 0.007868448893229166, "learning_rate": 0.0001, "loss": 4.204, "loss/crossentropy": 2.0777525305747986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20168712735176086, "step": 28862 }, { "epoch": 0.57728, "grad_norm": 1.8046875, "grad_norm_var": 0.0074127197265625, "learning_rate": 0.0001, "loss": 3.9328, "loss/crossentropy": 2.1630115509033203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19102369993925095, "step": 28864 }, { "epoch": 0.57732, "grad_norm": 1.8046875, "grad_norm_var": 0.0077301025390625, "learning_rate": 0.0001, "loss": 3.7329, "loss/crossentropy": 1.8166091442108154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1680990755558014, "step": 28866 }, { "epoch": 0.57736, "grad_norm": 1.96875, "grad_norm_var": 0.009419759114583334, "learning_rate": 0.0001, "loss": 4.1253, "loss/crossentropy": 2.073417544364929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20478951185941696, "step": 28868 }, { "epoch": 0.5774, "grad_norm": 1.8203125, "grad_norm_var": 0.007184855143229167, "learning_rate": 0.0001, "loss": 3.9782, "loss/crossentropy": 1.9039225578308105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17434976249933243, "step": 28870 }, { "epoch": 0.57744, "grad_norm": 2.03125, "grad_norm_var": 0.0077301025390625, "learning_rate": 0.0001, "loss": 4.0646, "loss/crossentropy": 2.3139874935150146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20631124079227448, "step": 28872 }, { "epoch": 0.57748, "grad_norm": 1.8828125, "grad_norm_var": 0.007279459635416667, "learning_rate": 0.0001, "loss": 3.8964, "loss/crossentropy": 2.1889963150024414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22081536054611206, "step": 28874 }, { "epoch": 0.57752, "grad_norm": 1.8359375, "grad_norm_var": 0.007283274332682292, "learning_rate": 0.0001, "loss": 3.9964, "loss/crossentropy": 2.1492413878440857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974084973335266, "step": 28876 }, { "epoch": 0.57756, "grad_norm": 1.8125, "grad_norm_var": 0.007201131184895833, "learning_rate": 0.0001, "loss": 3.751, "loss/crossentropy": 1.8974568843841553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1662147268652916, "step": 28878 }, { "epoch": 0.5776, "grad_norm": 1.7578125, "grad_norm_var": 0.008093007405598958, "learning_rate": 0.0001, "loss": 3.8792, "loss/crossentropy": 2.0488428473472595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929314285516739, "step": 28880 }, { "epoch": 0.57764, "grad_norm": 1.8984375, "grad_norm_var": 0.007664998372395833, "learning_rate": 0.0001, "loss": 3.9943, "loss/crossentropy": 1.9983501434326172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19050447642803192, "step": 28882 }, { "epoch": 0.57768, "grad_norm": 1.9453125, "grad_norm_var": 0.0066640218098958336, "learning_rate": 0.0001, "loss": 3.8174, "loss/crossentropy": 1.8651137948036194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18415149301290512, "step": 28884 }, { "epoch": 0.57772, "grad_norm": 1.828125, "grad_norm_var": 0.007055409749348958, "learning_rate": 0.0001, "loss": 3.8707, "loss/crossentropy": 2.0147623419761658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1963946893811226, "step": 28886 }, { "epoch": 0.57776, "grad_norm": 2.046875, "grad_norm_var": 0.23955052693684895, "learning_rate": 0.0001, "loss": 4.1033, "loss/crossentropy": 2.057682752609253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22113247215747833, "step": 28888 }, { "epoch": 0.5778, "grad_norm": 1.875, "grad_norm_var": 0.239501953125, "learning_rate": 0.0001, "loss": 4.2273, "loss/crossentropy": 2.0742294788360596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2198052555322647, "step": 28890 }, { "epoch": 0.57784, "grad_norm": 2.0, "grad_norm_var": 0.23716201782226562, "learning_rate": 0.0001, "loss": 4.0652, "loss/crossentropy": 2.1344255208969116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19126100838184357, "step": 28892 }, { "epoch": 0.57788, "grad_norm": 1.84375, "grad_norm_var": 0.23585611979166668, "learning_rate": 0.0001, "loss": 3.8575, "loss/crossentropy": 1.8000599145889282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17296022176742554, "step": 28894 }, { "epoch": 0.57792, "grad_norm": 1.90625, "grad_norm_var": 0.2333941141764323, "learning_rate": 0.0001, "loss": 4.0821, "loss/crossentropy": 2.2380261421203613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18993133306503296, "step": 28896 }, { "epoch": 0.57796, "grad_norm": 1.8984375, "grad_norm_var": 0.23112360636393228, "learning_rate": 0.0001, "loss": 4.1168, "loss/crossentropy": 2.066701650619507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944030448794365, "step": 28898 }, { "epoch": 0.578, "grad_norm": 1.9609375, "grad_norm_var": 0.22492268880208333, "learning_rate": 0.0001, "loss": 4.2338, "loss/crossentropy": 2.2941386699676514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139548882842064, "step": 28900 }, { "epoch": 0.57804, "grad_norm": 1.9609375, "grad_norm_var": 0.22176920572916667, "learning_rate": 0.0001, "loss": 3.8767, "loss/crossentropy": 2.206367015838623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887761801481247, "step": 28902 }, { "epoch": 0.57808, "grad_norm": 1.8828125, "grad_norm_var": 0.004473622639973958, "learning_rate": 0.0001, "loss": 4.1728, "loss/crossentropy": 2.176335871219635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016155868768692, "step": 28904 }, { "epoch": 0.57812, "grad_norm": 2.359375, "grad_norm_var": 0.015022532145182291, "learning_rate": 0.0001, "loss": 4.2275, "loss/crossentropy": 2.2069336771965027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20764954388141632, "step": 28906 }, { "epoch": 0.57816, "grad_norm": 1.828125, "grad_norm_var": 0.0157379150390625, "learning_rate": 0.0001, "loss": 3.9015, "loss/crossentropy": 2.2566596269607544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19805442541837692, "step": 28908 }, { "epoch": 0.5782, "grad_norm": 1.8671875, "grad_norm_var": 0.0152252197265625, "learning_rate": 0.0001, "loss": 3.8767, "loss/crossentropy": 1.8642338514328003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19232972711324692, "step": 28910 }, { "epoch": 0.57824, "grad_norm": 1.9453125, "grad_norm_var": 0.015119425455729167, "learning_rate": 0.0001, "loss": 4.1759, "loss/crossentropy": 1.9054778218269348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1713176593184471, "step": 28912 }, { "epoch": 0.57828, "grad_norm": 1.734375, "grad_norm_var": 0.0199859619140625, "learning_rate": 0.0001, "loss": 3.6261, "loss/crossentropy": 1.7423803210258484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16640456765890121, "step": 28914 }, { "epoch": 0.57832, "grad_norm": 2.03125, "grad_norm_var": 0.02054443359375, "learning_rate": 0.0001, "loss": 3.9985, "loss/crossentropy": 2.229828178882599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20995531231164932, "step": 28916 }, { "epoch": 0.57836, "grad_norm": 2.203125, "grad_norm_var": 0.0248443603515625, "learning_rate": 0.0001, "loss": 3.9723, "loss/crossentropy": 1.483105719089508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15799641609191895, "step": 28918 }, { "epoch": 0.5784, "grad_norm": 1.9140625, "grad_norm_var": 0.024395497639973958, "learning_rate": 0.0001, "loss": 4.2127, "loss/crossentropy": 2.2207179069519043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969587355852127, "step": 28920 }, { "epoch": 0.57844, "grad_norm": 1.9375, "grad_norm_var": 0.012259674072265626, "learning_rate": 0.0001, "loss": 3.9546, "loss/crossentropy": 2.3045194149017334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19103257358074188, "step": 28922 }, { "epoch": 0.57848, "grad_norm": 2.078125, "grad_norm_var": 0.014289347330729167, "learning_rate": 0.0001, "loss": 4.1317, "loss/crossentropy": 2.0717945098876953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20137913525104523, "step": 28924 }, { "epoch": 0.57852, "grad_norm": 2.046875, "grad_norm_var": 0.014926910400390625, "learning_rate": 0.0001, "loss": 4.0922, "loss/crossentropy": 2.2904374599456787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2105882465839386, "step": 28926 }, { "epoch": 0.57856, "grad_norm": 1.7890625, "grad_norm_var": 0.015818023681640626, "learning_rate": 0.0001, "loss": 3.9496, "loss/crossentropy": 2.3007075786590576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22365766763687134, "step": 28928 }, { "epoch": 0.5786, "grad_norm": 1.9765625, "grad_norm_var": 0.012276204427083333, "learning_rate": 0.0001, "loss": 4.082, "loss/crossentropy": 2.0107430815696716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18182440847158432, "step": 28930 }, { "epoch": 0.57864, "grad_norm": 1.7890625, "grad_norm_var": 0.013106028238932291, "learning_rate": 0.0001, "loss": 3.9859, "loss/crossentropy": 2.1066025495529175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19574615359306335, "step": 28932 }, { "epoch": 0.57868, "grad_norm": 1.890625, "grad_norm_var": 0.007995351155598959, "learning_rate": 0.0001, "loss": 3.9903, "loss/crossentropy": 2.1549283266067505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20238593965768814, "step": 28934 }, { "epoch": 0.57872, "grad_norm": 1.796875, "grad_norm_var": 0.010734049479166667, "learning_rate": 0.0001, "loss": 3.683, "loss/crossentropy": 2.0073814392089844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.185798779129982, "step": 28936 }, { "epoch": 0.57876, "grad_norm": 2.0, "grad_norm_var": 0.0111724853515625, "learning_rate": 0.0001, "loss": 3.8882, "loss/crossentropy": 1.7613274455070496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848607361316681, "step": 28938 }, { "epoch": 0.5788, "grad_norm": 1.9140625, "grad_norm_var": 0.08815078735351563, "learning_rate": 0.0001, "loss": 4.0142, "loss/crossentropy": 2.322823405265808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20245341211557388, "step": 28940 }, { "epoch": 0.57884, "grad_norm": 1.9609375, "grad_norm_var": 0.0876129150390625, "learning_rate": 0.0001, "loss": 3.8313, "loss/crossentropy": 1.7385436296463013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17173148691654205, "step": 28942 }, { "epoch": 0.57888, "grad_norm": 1.7421875, "grad_norm_var": 0.08886286417643229, "learning_rate": 0.0001, "loss": 3.9017, "loss/crossentropy": 1.9626798033714294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17217369377613068, "step": 28944 }, { "epoch": 0.57892, "grad_norm": 1.6640625, "grad_norm_var": 0.09463297526041667, "learning_rate": 0.0001, "loss": 3.7427, "loss/crossentropy": 2.4835134744644165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073291689157486, "step": 28946 }, { "epoch": 0.57896, "grad_norm": 1.9609375, "grad_norm_var": 0.09482192993164062, "learning_rate": 0.0001, "loss": 3.9099, "loss/crossentropy": 2.0418132543563843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23464705049991608, "step": 28948 }, { "epoch": 0.579, "grad_norm": 1.9609375, "grad_norm_var": 0.09468968709309895, "learning_rate": 0.0001, "loss": 4.2291, "loss/crossentropy": 2.2382947206497192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21655261516571045, "step": 28950 }, { "epoch": 0.57904, "grad_norm": 1.890625, "grad_norm_var": 0.09024632771809896, "learning_rate": 0.0001, "loss": 4.0013, "loss/crossentropy": 2.0737303495407104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19475168734788895, "step": 28952 }, { "epoch": 0.57908, "grad_norm": 1.7890625, "grad_norm_var": 0.09391988118489583, "learning_rate": 0.0001, "loss": 3.9371, "loss/crossentropy": 2.088689923286438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19861633330583572, "step": 28954 }, { "epoch": 0.57912, "grad_norm": 2.0, "grad_norm_var": 0.011472320556640625, "learning_rate": 0.0001, "loss": 3.6761, "loss/crossentropy": 1.8260209560394287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16850142180919647, "step": 28956 }, { "epoch": 0.57916, "grad_norm": 1.921875, "grad_norm_var": 0.010263824462890625, "learning_rate": 0.0001, "loss": 3.8753, "loss/crossentropy": 1.834153652191162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16283375024795532, "step": 28958 }, { "epoch": 0.5792, "grad_norm": 1.9296875, "grad_norm_var": 0.009569295247395833, "learning_rate": 0.0001, "loss": 3.7873, "loss/crossentropy": 1.9701993465423584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18593734502792358, "step": 28960 }, { "epoch": 0.57924, "grad_norm": 1.8125, "grad_norm_var": 0.006965128580729166, "learning_rate": 0.0001, "loss": 3.7081, "loss/crossentropy": 2.026169538497925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18718570470809937, "step": 28962 }, { "epoch": 0.57928, "grad_norm": 1.859375, "grad_norm_var": 0.006400553385416666, "learning_rate": 0.0001, "loss": 4.1803, "loss/crossentropy": 1.7088202238082886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1926863193511963, "step": 28964 }, { "epoch": 0.57932, "grad_norm": 1.9375, "grad_norm_var": 0.006498209635416667, "learning_rate": 0.0001, "loss": 4.1078, "loss/crossentropy": 2.008699059486389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17876552045345306, "step": 28966 }, { "epoch": 0.57936, "grad_norm": 1.984375, "grad_norm_var": 0.007279459635416667, "learning_rate": 0.0001, "loss": 3.8845, "loss/crossentropy": 1.7667925953865051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16738490760326385, "step": 28968 }, { "epoch": 0.5794, "grad_norm": 1.78125, "grad_norm_var": 0.010239410400390624, "learning_rate": 0.0001, "loss": 4.0384, "loss/crossentropy": 1.7718802094459534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15908128768205643, "step": 28970 }, { "epoch": 0.57944, "grad_norm": 1.8671875, "grad_norm_var": 0.007437896728515625, "learning_rate": 0.0001, "loss": 4.0281, "loss/crossentropy": 2.086379885673523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19513976573944092, "step": 28972 }, { "epoch": 0.57948, "grad_norm": 1.921875, "grad_norm_var": 0.0073486328125, "learning_rate": 0.0001, "loss": 4.0432, "loss/crossentropy": 2.157579243183136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022448629140854, "step": 28974 }, { "epoch": 0.57952, "grad_norm": 1.8203125, "grad_norm_var": 0.007689412434895833, "learning_rate": 0.0001, "loss": 4.0684, "loss/crossentropy": 1.9351604580879211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1752564013004303, "step": 28976 }, { "epoch": 0.57956, "grad_norm": 1.765625, "grad_norm_var": 0.0084625244140625, "learning_rate": 0.0001, "loss": 3.9584, "loss/crossentropy": 1.8642477989196777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1806860715150833, "step": 28978 }, { "epoch": 0.5796, "grad_norm": 1.84375, "grad_norm_var": 0.008420562744140625, "learning_rate": 0.0001, "loss": 3.9329, "loss/crossentropy": 2.0846880078315735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18332630395889282, "step": 28980 }, { "epoch": 0.57964, "grad_norm": 1.8125, "grad_norm_var": 0.008595530192057292, "learning_rate": 0.0001, "loss": 3.993, "loss/crossentropy": 2.019991397857666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845393031835556, "step": 28982 }, { "epoch": 0.57968, "grad_norm": 1.9453125, "grad_norm_var": 0.008250935872395834, "learning_rate": 0.0001, "loss": 4.0237, "loss/crossentropy": 2.148271918296814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20690763741731644, "step": 28984 }, { "epoch": 0.57972, "grad_norm": 1.8046875, "grad_norm_var": 0.0036740620930989583, "learning_rate": 0.0001, "loss": 4.1667, "loss/crossentropy": 2.243967056274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21643538773059845, "step": 28986 }, { "epoch": 0.57976, "grad_norm": 1.953125, "grad_norm_var": 0.004388173421223958, "learning_rate": 0.0001, "loss": 4.1014, "loss/crossentropy": 2.094264507293701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19375698268413544, "step": 28988 }, { "epoch": 0.5798, "grad_norm": 1.7578125, "grad_norm_var": 0.005425771077473958, "learning_rate": 0.0001, "loss": 3.6987, "loss/crossentropy": 2.0419931411743164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1727323830127716, "step": 28990 }, { "epoch": 0.57984, "grad_norm": 1.9375, "grad_norm_var": 0.0053708394368489586, "learning_rate": 0.0001, "loss": 3.887, "loss/crossentropy": 2.1932610273361206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20193511247634888, "step": 28992 }, { "epoch": 0.57988, "grad_norm": 2.015625, "grad_norm_var": 0.006646474202473958, "learning_rate": 0.0001, "loss": 4.144, "loss/crossentropy": 1.9849725365638733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20237861573696136, "step": 28994 }, { "epoch": 0.57992, "grad_norm": 1.8671875, "grad_norm_var": 0.006420644124348959, "learning_rate": 0.0001, "loss": 3.8113, "loss/crossentropy": 1.801219642162323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1687634363770485, "step": 28996 }, { "epoch": 0.57996, "grad_norm": 1.8515625, "grad_norm_var": 0.005686187744140625, "learning_rate": 0.0001, "loss": 4.0288, "loss/crossentropy": 2.018694579601288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17765481770038605, "step": 28998 }, { "epoch": 0.58, "grad_norm": 1.953125, "grad_norm_var": 0.005598958333333333, "learning_rate": 0.0001, "loss": 4.1113, "loss/crossentropy": 1.8309300541877747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19343653321266174, "step": 29000 }, { "epoch": 0.58004, "grad_norm": 1.9296875, "grad_norm_var": 0.005100250244140625, "learning_rate": 0.0001, "loss": 4.2134, "loss/crossentropy": 2.264810800552368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20135793089866638, "step": 29002 }, { "epoch": 0.58008, "grad_norm": 1.9140625, "grad_norm_var": 0.004903157552083333, "learning_rate": 0.0001, "loss": 3.9869, "loss/crossentropy": 2.0125122666358948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977495700120926, "step": 29004 }, { "epoch": 0.58012, "grad_norm": 1.7890625, "grad_norm_var": 0.0034739176432291665, "learning_rate": 0.0001, "loss": 3.7476, "loss/crossentropy": 1.7581869959831238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17235949635505676, "step": 29006 }, { "epoch": 0.58016, "grad_norm": 1.984375, "grad_norm_var": 0.0043609619140625, "learning_rate": 0.0001, "loss": 3.7818, "loss/crossentropy": 2.0644874572753906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20059636235237122, "step": 29008 }, { "epoch": 0.5802, "grad_norm": 1.8828125, "grad_norm_var": 0.0029296875, "learning_rate": 0.0001, "loss": 4.0232, "loss/crossentropy": 2.2566142082214355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18922609090805054, "step": 29010 }, { "epoch": 0.58024, "grad_norm": 1.9296875, "grad_norm_var": 0.0065996805826822914, "learning_rate": 0.0001, "loss": 4.2414, "loss/crossentropy": 2.3137046098709106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21420790255069733, "step": 29012 }, { "epoch": 0.58028, "grad_norm": 1.9296875, "grad_norm_var": 0.0064280192057291664, "learning_rate": 0.0001, "loss": 3.9192, "loss/crossentropy": 2.1429378986358643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20075450092554092, "step": 29014 }, { "epoch": 0.58032, "grad_norm": 1.8671875, "grad_norm_var": 0.007567342122395833, "learning_rate": 0.0001, "loss": 4.0905, "loss/crossentropy": 2.5844578742980957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21986322849988937, "step": 29016 }, { "epoch": 0.58036, "grad_norm": 1.875, "grad_norm_var": 0.0093902587890625, "learning_rate": 0.0001, "loss": 3.7977, "loss/crossentropy": 1.847869634628296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16430244594812393, "step": 29018 }, { "epoch": 0.5804, "grad_norm": 1.828125, "grad_norm_var": 0.010544586181640624, "learning_rate": 0.0001, "loss": 3.8369, "loss/crossentropy": 1.895844042301178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18856079131364822, "step": 29020 }, { "epoch": 0.58044, "grad_norm": 1.8984375, "grad_norm_var": 0.009113566080729166, "learning_rate": 0.0001, "loss": 3.9936, "loss/crossentropy": 2.1094332933425903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025027722120285, "step": 29022 }, { "epoch": 0.58048, "grad_norm": 2.140625, "grad_norm_var": 0.012172190348307292, "learning_rate": 0.0001, "loss": 4.1463, "loss/crossentropy": 1.820436716079712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1872788816690445, "step": 29024 }, { "epoch": 0.58052, "grad_norm": 2.0, "grad_norm_var": 0.011641184488932291, "learning_rate": 0.0001, "loss": 4.1155, "loss/crossentropy": 2.371734142303467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20477186143398285, "step": 29026 }, { "epoch": 0.58056, "grad_norm": 1.9921875, "grad_norm_var": 0.010273996988932292, "learning_rate": 0.0001, "loss": 4.3274, "loss/crossentropy": 2.4618901014328003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20902744680643082, "step": 29028 }, { "epoch": 0.5806, "grad_norm": 1.9296875, "grad_norm_var": 0.010044097900390625, "learning_rate": 0.0001, "loss": 4.0957, "loss/crossentropy": 1.9462957382202148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18320611119270325, "step": 29030 }, { "epoch": 0.58064, "grad_norm": 1.8984375, "grad_norm_var": 0.010589345296223959, "learning_rate": 0.0001, "loss": 3.8647, "loss/crossentropy": 2.07541561126709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19628040492534637, "step": 29032 }, { "epoch": 0.58068, "grad_norm": 1.9609375, "grad_norm_var": 0.009269205729166667, "learning_rate": 0.0001, "loss": 3.9063, "loss/crossentropy": 2.144878387451172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21301092207431793, "step": 29034 }, { "epoch": 0.58072, "grad_norm": 1.9296875, "grad_norm_var": 0.009490712483723959, "learning_rate": 0.0001, "loss": 3.8692, "loss/crossentropy": 1.9113212823867798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18449342995882034, "step": 29036 }, { "epoch": 0.58076, "grad_norm": 1.953125, "grad_norm_var": 0.0095611572265625, "learning_rate": 0.0001, "loss": 4.1034, "loss/crossentropy": 2.211828351020813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19539960473775864, "step": 29038 }, { "epoch": 0.5808, "grad_norm": 1.8515625, "grad_norm_var": 0.005179595947265625, "learning_rate": 0.0001, "loss": 4.0377, "loss/crossentropy": 2.1476653814315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979052647948265, "step": 29040 }, { "epoch": 0.58084, "grad_norm": 1.90625, "grad_norm_var": 0.004239654541015625, "learning_rate": 0.0001, "loss": 3.8152, "loss/crossentropy": 1.5791842937469482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17846909165382385, "step": 29042 }, { "epoch": 0.58088, "grad_norm": 1.9609375, "grad_norm_var": 0.002249908447265625, "learning_rate": 0.0001, "loss": 3.7192, "loss/crossentropy": 1.77503103017807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1774892508983612, "step": 29044 }, { "epoch": 0.58092, "grad_norm": 1.9921875, "grad_norm_var": 0.0028523763020833334, "learning_rate": 0.0001, "loss": 4.0933, "loss/crossentropy": 1.9995554089546204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18692871183156967, "step": 29046 }, { "epoch": 0.58096, "grad_norm": 1.859375, "grad_norm_var": 0.002750396728515625, "learning_rate": 0.0001, "loss": 3.9813, "loss/crossentropy": 2.057563364505768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19816753268241882, "step": 29048 }, { "epoch": 0.581, "grad_norm": 1.7265625, "grad_norm_var": 0.010599517822265625, "learning_rate": 0.0001, "loss": 3.7361, "loss/crossentropy": 2.1810996532440186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18617865443229675, "step": 29050 }, { "epoch": 0.58104, "grad_norm": 1.875, "grad_norm_var": 0.010038248697916667, "learning_rate": 0.0001, "loss": 4.0803, "loss/crossentropy": 2.212439775466919, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19946376979351044, "step": 29052 }, { "epoch": 0.58108, "grad_norm": 1.8046875, "grad_norm_var": 0.0109130859375, "learning_rate": 0.0001, "loss": 3.7017, "loss/crossentropy": 1.4078394174575806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14310228824615479, "step": 29054 }, { "epoch": 0.58112, "grad_norm": 1.8984375, "grad_norm_var": 0.0114013671875, "learning_rate": 0.0001, "loss": 3.9539, "loss/crossentropy": 2.0167892575263977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.206185445189476, "step": 29056 }, { "epoch": 0.58116, "grad_norm": 1.9375, "grad_norm_var": 0.013312784830729167, "learning_rate": 0.0001, "loss": 4.0975, "loss/crossentropy": 2.0703752040863037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20013286918401718, "step": 29058 }, { "epoch": 0.5812, "grad_norm": 1.7578125, "grad_norm_var": 0.015047200520833333, "learning_rate": 0.0001, "loss": 3.7723, "loss/crossentropy": 1.528024673461914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16279471665620804, "step": 29060 }, { "epoch": 0.58124, "grad_norm": 2.015625, "grad_norm_var": 0.016190338134765624, "learning_rate": 0.0001, "loss": 4.1933, "loss/crossentropy": 2.235237717628479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22028259187936783, "step": 29062 }, { "epoch": 0.58128, "grad_norm": 1.84375, "grad_norm_var": 0.017293294270833332, "learning_rate": 0.0001, "loss": 3.7427, "loss/crossentropy": 1.8778278231620789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16768044978380203, "step": 29064 }, { "epoch": 0.58132, "grad_norm": 1.7265625, "grad_norm_var": 0.011102040608723959, "learning_rate": 0.0001, "loss": 3.5901, "loss/crossentropy": 1.8109752535820007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16292864084243774, "step": 29066 }, { "epoch": 0.58136, "grad_norm": 2.328125, "grad_norm_var": 0.023502604166666666, "learning_rate": 0.0001, "loss": 3.8298, "loss/crossentropy": 1.9164445996284485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17200321704149246, "step": 29068 }, { "epoch": 0.5814, "grad_norm": 2.03125, "grad_norm_var": 0.024103800455729168, "learning_rate": 0.0001, "loss": 4.1952, "loss/crossentropy": 2.1318963766098022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20111309736967087, "step": 29070 }, { "epoch": 0.58144, "grad_norm": 2.078125, "grad_norm_var": 0.023606109619140624, "learning_rate": 0.0001, "loss": 4.1548, "loss/crossentropy": 1.8939481377601624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058345079421997, "step": 29072 }, { "epoch": 0.58148, "grad_norm": 1.96875, "grad_norm_var": 0.0228424072265625, "learning_rate": 0.0001, "loss": 4.1685, "loss/crossentropy": 2.0977996587753296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19700003415346146, "step": 29074 }, { "epoch": 0.58152, "grad_norm": 1.96875, "grad_norm_var": 0.0199615478515625, "learning_rate": 0.0001, "loss": 4.0528, "loss/crossentropy": 2.163521647453308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20974216610193253, "step": 29076 }, { "epoch": 0.58156, "grad_norm": 1.9375, "grad_norm_var": 0.019358062744140626, "learning_rate": 0.0001, "loss": 4.0584, "loss/crossentropy": 2.2900352478027344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20909854024648666, "step": 29078 }, { "epoch": 0.5816, "grad_norm": 1.8671875, "grad_norm_var": 0.018733723958333334, "learning_rate": 0.0001, "loss": 4.0277, "loss/crossentropy": 2.0664632320404053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772863119840622, "step": 29080 }, { "epoch": 0.58164, "grad_norm": 1.765625, "grad_norm_var": 0.018436431884765625, "learning_rate": 0.0001, "loss": 3.7484, "loss/crossentropy": 1.9867297410964966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18980251252651215, "step": 29082 }, { "epoch": 0.58168, "grad_norm": 1.9765625, "grad_norm_var": 0.00830078125, "learning_rate": 0.0001, "loss": 4.1918, "loss/crossentropy": 2.318376064300537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22295164316892624, "step": 29084 }, { "epoch": 0.58172, "grad_norm": 1.8984375, "grad_norm_var": 0.006156412760416666, "learning_rate": 0.0001, "loss": 3.8932, "loss/crossentropy": 2.0880175828933716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18823902308940887, "step": 29086 }, { "epoch": 0.58176, "grad_norm": 2.109375, "grad_norm_var": 0.007039133707682292, "learning_rate": 0.0001, "loss": 4.1964, "loss/crossentropy": 2.5453847646713257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24233754724264145, "step": 29088 }, { "epoch": 0.5818, "grad_norm": 1.734375, "grad_norm_var": 0.00999755859375, "learning_rate": 0.0001, "loss": 3.8693, "loss/crossentropy": 2.0354779958724976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2055264338850975, "step": 29090 }, { "epoch": 0.58184, "grad_norm": 1.9609375, "grad_norm_var": 0.009938303629557292, "learning_rate": 0.0001, "loss": 4.1731, "loss/crossentropy": 2.3989592790603638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18171776831150055, "step": 29092 }, { "epoch": 0.58188, "grad_norm": 2.125, "grad_norm_var": 0.013114166259765626, "learning_rate": 0.0001, "loss": 4.0719, "loss/crossentropy": 2.0136520862579346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18895280361175537, "step": 29094 }, { "epoch": 0.58192, "grad_norm": 1.8046875, "grad_norm_var": 0.013071441650390625, "learning_rate": 0.0001, "loss": 3.9095, "loss/crossentropy": 2.057899534702301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17919254302978516, "step": 29096 }, { "epoch": 0.58196, "grad_norm": 1.7890625, "grad_norm_var": 0.013002268473307292, "learning_rate": 0.0001, "loss": 3.8561, "loss/crossentropy": 1.9806170463562012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2131671980023384, "step": 29098 }, { "epoch": 0.582, "grad_norm": 1.890625, "grad_norm_var": 0.013734690348307292, "learning_rate": 0.0001, "loss": 3.747, "loss/crossentropy": 2.0952848196029663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2057284563779831, "step": 29100 }, { "epoch": 0.58204, "grad_norm": 2.015625, "grad_norm_var": 0.013346354166666666, "learning_rate": 0.0001, "loss": 4.2041, "loss/crossentropy": 2.2287687063217163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18772760778665543, "step": 29102 }, { "epoch": 0.58208, "grad_norm": 1.984375, "grad_norm_var": 0.012221018473307291, "learning_rate": 0.0001, "loss": 3.9164, "loss/crossentropy": 2.2227126359939575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002912387251854, "step": 29104 }, { "epoch": 0.58212, "grad_norm": 1.8359375, "grad_norm_var": 0.010042317708333333, "learning_rate": 0.0001, "loss": 3.724, "loss/crossentropy": 1.9384364485740662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18695514649152756, "step": 29106 }, { "epoch": 0.58216, "grad_norm": 1.859375, "grad_norm_var": 0.010481516520182291, "learning_rate": 0.0001, "loss": 3.8985, "loss/crossentropy": 1.7653113007545471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19013341516256332, "step": 29108 }, { "epoch": 0.5822, "grad_norm": 1.9296875, "grad_norm_var": 0.008128865559895834, "learning_rate": 0.0001, "loss": 3.8348, "loss/crossentropy": 1.7308897972106934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1701432168483734, "step": 29110 }, { "epoch": 0.58224, "grad_norm": 2.234375, "grad_norm_var": 0.017435709635416668, "learning_rate": 0.0001, "loss": 4.3871, "loss/crossentropy": 2.286720633506775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2024293765425682, "step": 29112 }, { "epoch": 0.58228, "grad_norm": 1.7890625, "grad_norm_var": 0.0161529541015625, "learning_rate": 0.0001, "loss": 3.744, "loss/crossentropy": 1.8618659377098083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18028806149959564, "step": 29114 }, { "epoch": 0.58232, "grad_norm": 1.9921875, "grad_norm_var": 0.0154449462890625, "learning_rate": 0.0001, "loss": 4.0077, "loss/crossentropy": 2.128199815750122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121022716164589, "step": 29116 }, { "epoch": 0.58236, "grad_norm": 1.84375, "grad_norm_var": 0.0156890869140625, "learning_rate": 0.0001, "loss": 3.731, "loss/crossentropy": 2.0003921389579773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1782400906085968, "step": 29118 }, { "epoch": 0.5824, "grad_norm": 1.78125, "grad_norm_var": 0.01651611328125, "learning_rate": 0.0001, "loss": 3.7362, "loss/crossentropy": 1.653384268283844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1676875352859497, "step": 29120 }, { "epoch": 0.58244, "grad_norm": 1.921875, "grad_norm_var": 0.016261545817057292, "learning_rate": 0.0001, "loss": 4.1866, "loss/crossentropy": 2.3602925539016724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22520510107278824, "step": 29122 }, { "epoch": 0.58248, "grad_norm": 1.984375, "grad_norm_var": 0.017814127604166667, "learning_rate": 0.0001, "loss": 4.0382, "loss/crossentropy": 1.839131772518158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19381030648946762, "step": 29124 }, { "epoch": 0.58252, "grad_norm": 2.09375, "grad_norm_var": 0.017878977457682292, "learning_rate": 0.0001, "loss": 4.178, "loss/crossentropy": 2.312969148159027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22048135846853256, "step": 29126 }, { "epoch": 0.58256, "grad_norm": 2.046875, "grad_norm_var": 0.010846964518229167, "learning_rate": 0.0001, "loss": 4.0541, "loss/crossentropy": 2.153268814086914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2186889946460724, "step": 29128 }, { "epoch": 0.5826, "grad_norm": 2.03125, "grad_norm_var": 0.011527252197265626, "learning_rate": 0.0001, "loss": 3.9336, "loss/crossentropy": 2.1008894443511963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893206238746643, "step": 29130 }, { "epoch": 0.58264, "grad_norm": 1.9453125, "grad_norm_var": 0.011331939697265625, "learning_rate": 0.0001, "loss": 4.0611, "loss/crossentropy": 2.0414587259292603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952778548002243, "step": 29132 }, { "epoch": 0.58268, "grad_norm": 2.09375, "grad_norm_var": 0.013085683186848959, "learning_rate": 0.0001, "loss": 4.1025, "loss/crossentropy": 2.440865635871887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21561746299266815, "step": 29134 }, { "epoch": 0.58272, "grad_norm": 1.9296875, "grad_norm_var": 0.012325032552083334, "learning_rate": 0.0001, "loss": 4.1542, "loss/crossentropy": 2.309555768966675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22722266614437103, "step": 29136 }, { "epoch": 0.58276, "grad_norm": 1.8671875, "grad_norm_var": 0.012910715738932292, "learning_rate": 0.0001, "loss": 4.0456, "loss/crossentropy": 2.1777420043945312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994665190577507, "step": 29138 }, { "epoch": 0.5828, "grad_norm": 2.515625, "grad_norm_var": 0.03129781087239583, "learning_rate": 0.0001, "loss": 3.9863, "loss/crossentropy": 2.174704909324646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19741538912057877, "step": 29140 }, { "epoch": 0.58284, "grad_norm": 1.8046875, "grad_norm_var": 0.03214111328125, "learning_rate": 0.0001, "loss": 3.889, "loss/crossentropy": 2.0467293858528137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19651174545288086, "step": 29142 }, { "epoch": 0.58288, "grad_norm": 1.8671875, "grad_norm_var": 0.030418904622395833, "learning_rate": 0.0001, "loss": 4.0606, "loss/crossentropy": 2.050451397895813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18523690849542618, "step": 29144 }, { "epoch": 0.58292, "grad_norm": 1.9609375, "grad_norm_var": 0.027787272135416666, "learning_rate": 0.0001, "loss": 4.1576, "loss/crossentropy": 2.2801162004470825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20639225095510483, "step": 29146 }, { "epoch": 0.58296, "grad_norm": 1.9140625, "grad_norm_var": 0.03065973917643229, "learning_rate": 0.0001, "loss": 3.7002, "loss/crossentropy": 1.7782122492790222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15972331911325455, "step": 29148 }, { "epoch": 0.583, "grad_norm": 2.015625, "grad_norm_var": 0.029813639322916665, "learning_rate": 0.0001, "loss": 3.9282, "loss/crossentropy": 2.1129753589630127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19876645505428314, "step": 29150 }, { "epoch": 0.58304, "grad_norm": 2.078125, "grad_norm_var": 0.029259999593098957, "learning_rate": 0.0001, "loss": 4.0128, "loss/crossentropy": 1.7885400652885437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17748355120420456, "step": 29152 }, { "epoch": 0.58308, "grad_norm": 1.8671875, "grad_norm_var": 0.029117584228515625, "learning_rate": 0.0001, "loss": 3.9049, "loss/crossentropy": 1.9744374752044678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20229245722293854, "step": 29154 }, { "epoch": 0.58312, "grad_norm": 1.7890625, "grad_norm_var": 0.007785797119140625, "learning_rate": 0.0001, "loss": 3.8136, "loss/crossentropy": 1.7132557034492493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17987029254436493, "step": 29156 }, { "epoch": 0.58316, "grad_norm": 1.875, "grad_norm_var": 0.007389068603515625, "learning_rate": 0.0001, "loss": 3.9586, "loss/crossentropy": 2.1243802309036255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19921009242534637, "step": 29158 }, { "epoch": 0.5832, "grad_norm": 1.828125, "grad_norm_var": 0.0078277587890625, "learning_rate": 0.0001, "loss": 3.8449, "loss/crossentropy": 1.9450802206993103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19191914796829224, "step": 29160 }, { "epoch": 0.58324, "grad_norm": 1.875, "grad_norm_var": 0.007372792561848958, "learning_rate": 0.0001, "loss": 3.9835, "loss/crossentropy": 1.936660647392273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18147709965705872, "step": 29162 }, { "epoch": 0.58328, "grad_norm": 2.125, "grad_norm_var": 0.009403228759765625, "learning_rate": 0.0001, "loss": 4.1735, "loss/crossentropy": 2.4342548847198486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22083742171525955, "step": 29164 }, { "epoch": 0.58332, "grad_norm": 1.8984375, "grad_norm_var": 0.0081298828125, "learning_rate": 0.0001, "loss": 3.7471, "loss/crossentropy": 1.8538039922714233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17534375935792923, "step": 29166 }, { "epoch": 0.58336, "grad_norm": 1.8984375, "grad_norm_var": 0.006959788004557292, "learning_rate": 0.0001, "loss": 4.0749, "loss/crossentropy": 2.2351138591766357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20727767050266266, "step": 29168 }, { "epoch": 0.5834, "grad_norm": 1.75, "grad_norm_var": 0.008055623372395833, "learning_rate": 0.0001, "loss": 3.7464, "loss/crossentropy": 1.655074954032898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17760378867387772, "step": 29170 }, { "epoch": 0.58344, "grad_norm": 1.8203125, "grad_norm_var": 0.007902018229166667, "learning_rate": 0.0001, "loss": 3.9026, "loss/crossentropy": 1.7590230703353882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17171693593263626, "step": 29172 }, { "epoch": 0.58348, "grad_norm": 1.9921875, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 3.8981, "loss/crossentropy": 1.7671796083450317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16575640812516212, "step": 29174 }, { "epoch": 0.58352, "grad_norm": 1.8828125, "grad_norm_var": 0.012686920166015626, "learning_rate": 0.0001, "loss": 4.0845, "loss/crossentropy": 1.8754821419715881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17734245210886002, "step": 29176 }, { "epoch": 0.58356, "grad_norm": 1.859375, "grad_norm_var": 0.013014475504557291, "learning_rate": 0.0001, "loss": 3.9861, "loss/crossentropy": 2.152361035346985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24318483471870422, "step": 29178 }, { "epoch": 0.5836, "grad_norm": 1.96875, "grad_norm_var": 0.010158030192057292, "learning_rate": 0.0001, "loss": 4.0301, "loss/crossentropy": 2.091741681098938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20466475933790207, "step": 29180 }, { "epoch": 0.58364, "grad_norm": 1.8046875, "grad_norm_var": 0.011757151285807291, "learning_rate": 0.0001, "loss": 4.003, "loss/crossentropy": 2.493666410446167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20663270354270935, "step": 29182 }, { "epoch": 0.58368, "grad_norm": 1.9765625, "grad_norm_var": 0.012247467041015625, "learning_rate": 0.0001, "loss": 3.9938, "loss/crossentropy": 2.311075210571289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115970104932785, "step": 29184 }, { "epoch": 0.58372, "grad_norm": 1.9453125, "grad_norm_var": 0.010789998372395833, "learning_rate": 0.0001, "loss": 4.0891, "loss/crossentropy": 2.2983113527297974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2209111899137497, "step": 29186 }, { "epoch": 0.58376, "grad_norm": 1.9765625, "grad_norm_var": 0.009992472330729167, "learning_rate": 0.0001, "loss": 4.1642, "loss/crossentropy": 2.035028040409088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19093219190835953, "step": 29188 }, { "epoch": 0.5838, "grad_norm": 1.8515625, "grad_norm_var": 0.008333333333333333, "learning_rate": 0.0001, "loss": 3.9368, "loss/crossentropy": 2.1713255047798157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836071014404297, "step": 29190 }, { "epoch": 0.58384, "grad_norm": 1.9296875, "grad_norm_var": 0.005033111572265625, "learning_rate": 0.0001, "loss": 3.9997, "loss/crossentropy": 1.898379623889923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19957400858402252, "step": 29192 }, { "epoch": 0.58388, "grad_norm": 1.90625, "grad_norm_var": 0.004624176025390625, "learning_rate": 0.0001, "loss": 4.1757, "loss/crossentropy": 1.930584728717804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1872590333223343, "step": 29194 }, { "epoch": 0.58392, "grad_norm": 1.9453125, "grad_norm_var": 0.0045654296875, "learning_rate": 0.0001, "loss": 3.9956, "loss/crossentropy": 2.2888232469558716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20355024188756943, "step": 29196 }, { "epoch": 0.58396, "grad_norm": 1.8203125, "grad_norm_var": 0.005961100260416667, "learning_rate": 0.0001, "loss": 4.0838, "loss/crossentropy": 2.242384433746338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2161145657300949, "step": 29198 }, { "epoch": 0.584, "grad_norm": 1.8125, "grad_norm_var": 0.0058024088541666664, "learning_rate": 0.0001, "loss": 3.9119, "loss/crossentropy": 1.7642216086387634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17429838329553604, "step": 29200 }, { "epoch": 0.58404, "grad_norm": 1.9609375, "grad_norm_var": 0.005785878499348958, "learning_rate": 0.0001, "loss": 4.0763, "loss/crossentropy": 2.2808165550231934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21707262098789215, "step": 29202 }, { "epoch": 0.58408, "grad_norm": 1.9375, "grad_norm_var": 0.008158365885416666, "learning_rate": 0.0001, "loss": 4.2149, "loss/crossentropy": 2.2542625665664673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21408841013908386, "step": 29204 }, { "epoch": 0.58412, "grad_norm": 1.9140625, "grad_norm_var": 0.0079010009765625, "learning_rate": 0.0001, "loss": 3.8883, "loss/crossentropy": 2.084090828895569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18555748462677002, "step": 29206 }, { "epoch": 0.58416, "grad_norm": 2.265625, "grad_norm_var": 0.01376953125, "learning_rate": 0.0001, "loss": 4.1465, "loss/crossentropy": 2.3946259021759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20357777923345566, "step": 29208 }, { "epoch": 0.5842, "grad_norm": 1.9375, "grad_norm_var": 0.014697011311848958, "learning_rate": 0.0001, "loss": 4.1172, "loss/crossentropy": 2.2105261087417603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044535130262375, "step": 29210 }, { "epoch": 0.58424, "grad_norm": 1.9140625, "grad_norm_var": 0.018904368082682293, "learning_rate": 0.0001, "loss": 3.639, "loss/crossentropy": 1.5793652534484863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1763077676296234, "step": 29212 }, { "epoch": 0.58428, "grad_norm": 2.03125, "grad_norm_var": 0.016340128580729165, "learning_rate": 0.0001, "loss": 4.2916, "loss/crossentropy": 2.352652430534363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22064512968063354, "step": 29214 }, { "epoch": 0.58432, "grad_norm": 1.7734375, "grad_norm_var": 0.017374674479166668, "learning_rate": 0.0001, "loss": 3.6412, "loss/crossentropy": 1.9069225192070007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17932289838790894, "step": 29216 }, { "epoch": 0.58436, "grad_norm": 1.890625, "grad_norm_var": 0.017899576822916666, "learning_rate": 0.0001, "loss": 3.8841, "loss/crossentropy": 1.7844374179840088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17104727029800415, "step": 29218 }, { "epoch": 0.5844, "grad_norm": 1.859375, "grad_norm_var": 0.01678441365559896, "learning_rate": 0.0001, "loss": 3.8258, "loss/crossentropy": 1.917112410068512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17463716119527817, "step": 29220 }, { "epoch": 0.58444, "grad_norm": 1.9140625, "grad_norm_var": 0.017103830973307293, "learning_rate": 0.0001, "loss": 4.0424, "loss/crossentropy": 1.9697306156158447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1797391176223755, "step": 29222 }, { "epoch": 0.58448, "grad_norm": 1.8671875, "grad_norm_var": 0.012092081705729167, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 2.1226717829704285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1818629801273346, "step": 29224 }, { "epoch": 0.58452, "grad_norm": 1.90625, "grad_norm_var": 0.010567220052083333, "learning_rate": 0.0001, "loss": 3.9699, "loss/crossentropy": 2.406674861907959, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20577572286128998, "step": 29226 }, { "epoch": 0.58456, "grad_norm": 1.9375, "grad_norm_var": 0.008235677083333334, "learning_rate": 0.0001, "loss": 3.915, "loss/crossentropy": 2.14900803565979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19237977266311646, "step": 29228 }, { "epoch": 0.5846, "grad_norm": 1.875, "grad_norm_var": 0.006803385416666667, "learning_rate": 0.0001, "loss": 3.9717, "loss/crossentropy": 1.7613251209259033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15341567248106003, "step": 29230 }, { "epoch": 0.58464, "grad_norm": 1.8671875, "grad_norm_var": 0.0053403218587239586, "learning_rate": 0.0001, "loss": 3.9471, "loss/crossentropy": 2.0507588386535645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979541778564453, "step": 29232 }, { "epoch": 0.58468, "grad_norm": 1.9453125, "grad_norm_var": 0.005985260009765625, "learning_rate": 0.0001, "loss": 3.9946, "loss/crossentropy": 1.9034594297409058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17682257294654846, "step": 29234 }, { "epoch": 0.58472, "grad_norm": 1.90625, "grad_norm_var": 0.0058258056640625, "learning_rate": 0.0001, "loss": 3.9959, "loss/crossentropy": 2.317684054374695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21268346905708313, "step": 29236 }, { "epoch": 0.58476, "grad_norm": 1.9921875, "grad_norm_var": 0.0064656575520833336, "learning_rate": 0.0001, "loss": 4.2448, "loss/crossentropy": 2.15400493144989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20123393833637238, "step": 29238 }, { "epoch": 0.5848, "grad_norm": 1.8125, "grad_norm_var": 0.0051513671875, "learning_rate": 0.0001, "loss": 4.0108, "loss/crossentropy": 1.8610196709632874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16698840260505676, "step": 29240 }, { "epoch": 0.58484, "grad_norm": 2.140625, "grad_norm_var": 0.008213043212890625, "learning_rate": 0.0001, "loss": 3.9196, "loss/crossentropy": 1.9535472989082336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17425117641687393, "step": 29242 }, { "epoch": 0.58488, "grad_norm": 1.875, "grad_norm_var": 0.008185831705729167, "learning_rate": 0.0001, "loss": 3.9185, "loss/crossentropy": 1.966432809829712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19216462224721909, "step": 29244 }, { "epoch": 0.58492, "grad_norm": 1.8203125, "grad_norm_var": 0.009163157145182291, "learning_rate": 0.0001, "loss": 3.7805, "loss/crossentropy": 2.0415098071098328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909058690071106, "step": 29246 }, { "epoch": 0.58496, "grad_norm": 1.875, "grad_norm_var": 0.010025787353515624, "learning_rate": 0.0001, "loss": 4.0979, "loss/crossentropy": 2.1297216415405273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19929852336645126, "step": 29248 }, { "epoch": 0.585, "grad_norm": 1.8828125, "grad_norm_var": 0.009417470296223958, "learning_rate": 0.0001, "loss": 4.0817, "loss/crossentropy": 2.1042102575302124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20568375289440155, "step": 29250 }, { "epoch": 0.58504, "grad_norm": 1.9375, "grad_norm_var": 0.009403483072916666, "learning_rate": 0.0001, "loss": 3.8923, "loss/crossentropy": 2.1890978813171387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029515951871872, "step": 29252 }, { "epoch": 0.58508, "grad_norm": 2.0625, "grad_norm_var": 0.009261067708333333, "learning_rate": 0.0001, "loss": 3.989, "loss/crossentropy": 2.061577618122101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19042493402957916, "step": 29254 }, { "epoch": 0.58512, "grad_norm": 2.109375, "grad_norm_var": 0.011366526285807291, "learning_rate": 0.0001, "loss": 4.0084, "loss/crossentropy": 2.0101218819618225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19894610345363617, "step": 29256 }, { "epoch": 0.58516, "grad_norm": 2.03125, "grad_norm_var": 0.008332316080729167, "learning_rate": 0.0001, "loss": 4.0886, "loss/crossentropy": 1.7634521126747131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18951942026615143, "step": 29258 }, { "epoch": 0.5852, "grad_norm": 1.921875, "grad_norm_var": 0.06482518513997396, "learning_rate": 0.0001, "loss": 3.9788, "loss/crossentropy": 2.1544201374053955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19801289588212967, "step": 29260 }, { "epoch": 0.58524, "grad_norm": 2.03125, "grad_norm_var": 0.062154134114583336, "learning_rate": 0.0001, "loss": 3.9742, "loss/crossentropy": 2.2371232509613037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19622696936130524, "step": 29262 }, { "epoch": 0.58528, "grad_norm": 1.9140625, "grad_norm_var": 0.06226781209309896, "learning_rate": 0.0001, "loss": 3.8049, "loss/crossentropy": 1.9571356773376465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19151850044727325, "step": 29264 }, { "epoch": 0.58532, "grad_norm": 1.8984375, "grad_norm_var": 0.06256103515625, "learning_rate": 0.0001, "loss": 4.1047, "loss/crossentropy": 2.149628520011902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19668041914701462, "step": 29266 }, { "epoch": 0.58536, "grad_norm": 1.921875, "grad_norm_var": 0.06358006795247396, "learning_rate": 0.0001, "loss": 4.186, "loss/crossentropy": 2.01633083820343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19449809193611145, "step": 29268 }, { "epoch": 0.5854, "grad_norm": 1.96875, "grad_norm_var": 0.06281636555989584, "learning_rate": 0.0001, "loss": 4.0058, "loss/crossentropy": 2.144730567932129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20602761209011078, "step": 29270 }, { "epoch": 0.58544, "grad_norm": 1.828125, "grad_norm_var": 0.06256917317708334, "learning_rate": 0.0001, "loss": 3.9861, "loss/crossentropy": 1.7255420684814453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16974858194589615, "step": 29272 }, { "epoch": 0.58548, "grad_norm": 1.7890625, "grad_norm_var": 0.06447931925455729, "learning_rate": 0.0001, "loss": 4.1475, "loss/crossentropy": 2.3286606073379517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005838081240654, "step": 29274 }, { "epoch": 0.58552, "grad_norm": 1.953125, "grad_norm_var": 0.0079498291015625, "learning_rate": 0.0001, "loss": 4.1118, "loss/crossentropy": 2.085566818714142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19351552426815033, "step": 29276 }, { "epoch": 0.58556, "grad_norm": 1.7421875, "grad_norm_var": 0.010188547770182292, "learning_rate": 0.0001, "loss": 3.8693, "loss/crossentropy": 1.9797600507736206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020813450217247, "step": 29278 }, { "epoch": 0.5856, "grad_norm": 2.0, "grad_norm_var": 0.011328125, "learning_rate": 0.0001, "loss": 4.0625, "loss/crossentropy": 2.2091132402420044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1922968104481697, "step": 29280 }, { "epoch": 0.58564, "grad_norm": 1.9609375, "grad_norm_var": 0.011665852864583333, "learning_rate": 0.0001, "loss": 4.0093, "loss/crossentropy": 2.220807909965515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21127308905124664, "step": 29282 }, { "epoch": 0.58568, "grad_norm": 2.109375, "grad_norm_var": 0.013206990559895833, "learning_rate": 0.0001, "loss": 3.9675, "loss/crossentropy": 1.9829080700874329, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18882058560848236, "step": 29284 }, { "epoch": 0.58572, "grad_norm": 1.96875, "grad_norm_var": 0.011864217122395833, "learning_rate": 0.0001, "loss": 4.1373, "loss/crossentropy": 1.9328150153160095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1966244801878929, "step": 29286 }, { "epoch": 0.58576, "grad_norm": 1.984375, "grad_norm_var": 0.0115631103515625, "learning_rate": 0.0001, "loss": 4.0272, "loss/crossentropy": 2.109495759010315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19433388113975525, "step": 29288 }, { "epoch": 0.5858, "grad_norm": 1.890625, "grad_norm_var": 0.009913889567057292, "learning_rate": 0.0001, "loss": 4.1732, "loss/crossentropy": 2.3193604946136475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086661458015442, "step": 29290 }, { "epoch": 0.58584, "grad_norm": 1.8671875, "grad_norm_var": 0.010422515869140624, "learning_rate": 0.0001, "loss": 3.9131, "loss/crossentropy": 2.015924036502838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1846477910876274, "step": 29292 }, { "epoch": 0.58588, "grad_norm": 2.015625, "grad_norm_var": 0.007323201497395833, "learning_rate": 0.0001, "loss": 4.1275, "loss/crossentropy": 2.2601329684257507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19521460682153702, "step": 29294 }, { "epoch": 0.58592, "grad_norm": 1.96875, "grad_norm_var": 0.005890909830729167, "learning_rate": 0.0001, "loss": 3.7702, "loss/crossentropy": 1.8056439757347107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19355197250843048, "step": 29296 }, { "epoch": 0.58596, "grad_norm": 2.046875, "grad_norm_var": 0.005564117431640625, "learning_rate": 0.0001, "loss": 4.2989, "loss/crossentropy": 2.071826934814453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19889701902866364, "step": 29298 }, { "epoch": 0.586, "grad_norm": 1.9609375, "grad_norm_var": 0.0033111572265625, "learning_rate": 0.0001, "loss": 4.0176, "loss/crossentropy": 2.2320820093154907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21529612690210342, "step": 29300 }, { "epoch": 0.58604, "grad_norm": 2.015625, "grad_norm_var": 0.0034912109375, "learning_rate": 0.0001, "loss": 4.2805, "loss/crossentropy": 2.1989121437072754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20347003638744354, "step": 29302 }, { "epoch": 0.58608, "grad_norm": 1.859375, "grad_norm_var": 0.0037679036458333334, "learning_rate": 0.0001, "loss": 3.8216, "loss/crossentropy": 2.128119111061096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1872687190771103, "step": 29304 }, { "epoch": 0.58612, "grad_norm": 1.8515625, "grad_norm_var": 0.010396067301432292, "learning_rate": 0.0001, "loss": 4.0913, "loss/crossentropy": 1.9635827541351318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851392686367035, "step": 29306 }, { "epoch": 0.58616, "grad_norm": 2.1875, "grad_norm_var": 0.0121002197265625, "learning_rate": 0.0001, "loss": 4.2218, "loss/crossentropy": 2.2543106079101562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21720967441797256, "step": 29308 }, { "epoch": 0.5862, "grad_norm": 2.0, "grad_norm_var": 0.013387044270833334, "learning_rate": 0.0001, "loss": 3.8456, "loss/crossentropy": 2.3444888591766357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140304371714592, "step": 29310 }, { "epoch": 0.58624, "grad_norm": 1.984375, "grad_norm_var": 0.013182576497395833, "learning_rate": 0.0001, "loss": 4.1098, "loss/crossentropy": 2.0403956174850464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20437155663967133, "step": 29312 }, { "epoch": 0.58628, "grad_norm": 1.7890625, "grad_norm_var": 0.01605199178059896, "learning_rate": 0.0001, "loss": 4.0612, "loss/crossentropy": 1.9070860743522644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18735133856534958, "step": 29314 }, { "epoch": 0.58632, "grad_norm": 1.875, "grad_norm_var": 0.018308258056640624, "learning_rate": 0.0001, "loss": 3.917, "loss/crossentropy": 1.9588303565979004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16786205023527145, "step": 29316 }, { "epoch": 0.58636, "grad_norm": 1.7578125, "grad_norm_var": 0.0215240478515625, "learning_rate": 0.0001, "loss": 3.8377, "loss/crossentropy": 1.8938056230545044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18695908039808273, "step": 29318 }, { "epoch": 0.5864, "grad_norm": 1.78125, "grad_norm_var": 0.022867584228515626, "learning_rate": 0.0001, "loss": 3.7435, "loss/crossentropy": 2.254639148712158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19976592808961868, "step": 29320 }, { "epoch": 0.58644, "grad_norm": 1.8828125, "grad_norm_var": 0.01688232421875, "learning_rate": 0.0001, "loss": 3.7988, "loss/crossentropy": 2.2075421810150146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18852433562278748, "step": 29322 }, { "epoch": 0.58648, "grad_norm": 2.03125, "grad_norm_var": 0.010860188802083334, "learning_rate": 0.0001, "loss": 3.9432, "loss/crossentropy": 1.6794416904449463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18429157137870789, "step": 29324 }, { "epoch": 0.58652, "grad_norm": 1.8515625, "grad_norm_var": 0.010111236572265625, "learning_rate": 0.0001, "loss": 4.0925, "loss/crossentropy": 2.3620080947875977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21314998716115952, "step": 29326 }, { "epoch": 0.58656, "grad_norm": 1.9453125, "grad_norm_var": 0.0092681884765625, "learning_rate": 0.0001, "loss": 4.1378, "loss/crossentropy": 2.3381006717681885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20351716130971909, "step": 29328 }, { "epoch": 0.5866, "grad_norm": 1.96875, "grad_norm_var": 0.009765370686848959, "learning_rate": 0.0001, "loss": 3.9563, "loss/crossentropy": 2.0491183400154114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19619758427143097, "step": 29330 }, { "epoch": 0.58664, "grad_norm": 1.890625, "grad_norm_var": 0.009855143229166667, "learning_rate": 0.0001, "loss": 3.8267, "loss/crossentropy": 1.7091345191001892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15915849804878235, "step": 29332 }, { "epoch": 0.58668, "grad_norm": 1.9375, "grad_norm_var": 0.005589803059895833, "learning_rate": 0.0001, "loss": 4.0168, "loss/crossentropy": 2.1986491680145264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997922584414482, "step": 29334 }, { "epoch": 0.58672, "grad_norm": 2.015625, "grad_norm_var": 0.006086985270182292, "learning_rate": 0.0001, "loss": 3.8094, "loss/crossentropy": 2.051343023777008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19083235412836075, "step": 29336 }, { "epoch": 0.58676, "grad_norm": 2.0625, "grad_norm_var": 0.008845774332682292, "learning_rate": 0.0001, "loss": 3.7667, "loss/crossentropy": 2.0580617785453796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19386407732963562, "step": 29338 }, { "epoch": 0.5868, "grad_norm": 1.796875, "grad_norm_var": 0.007983144124348958, "learning_rate": 0.0001, "loss": 3.8437, "loss/crossentropy": 1.988048791885376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16248874366283417, "step": 29340 }, { "epoch": 0.58684, "grad_norm": 2.359375, "grad_norm_var": 0.022215779622395834, "learning_rate": 0.0001, "loss": 3.7604, "loss/crossentropy": 1.9649037718772888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17722021788358688, "step": 29342 }, { "epoch": 0.58688, "grad_norm": 1.9140625, "grad_norm_var": 0.021801503499348958, "learning_rate": 0.0001, "loss": 4.1983, "loss/crossentropy": 2.014201283454895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19323624670505524, "step": 29344 }, { "epoch": 0.58692, "grad_norm": 1.8828125, "grad_norm_var": 0.021170806884765626, "learning_rate": 0.0001, "loss": 3.8541, "loss/crossentropy": 1.9666126370429993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17525266110897064, "step": 29346 }, { "epoch": 0.58696, "grad_norm": 1.8359375, "grad_norm_var": 0.021923573811848958, "learning_rate": 0.0001, "loss": 3.7258, "loss/crossentropy": 2.10392427444458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954725682735443, "step": 29348 }, { "epoch": 0.587, "grad_norm": 2.140625, "grad_norm_var": 0.0308013916015625, "learning_rate": 0.0001, "loss": 4.0668, "loss/crossentropy": 1.6613619327545166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21818602830171585, "step": 29350 }, { "epoch": 0.58704, "grad_norm": 1.875, "grad_norm_var": 0.030460611979166666, "learning_rate": 0.0001, "loss": 4.0683, "loss/crossentropy": 2.4305994510650635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147374525666237, "step": 29352 }, { "epoch": 0.58708, "grad_norm": 2.0625, "grad_norm_var": 0.026944986979166665, "learning_rate": 0.0001, "loss": 4.2551, "loss/crossentropy": 2.1946258544921875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22057125717401505, "step": 29354 }, { "epoch": 0.58712, "grad_norm": 1.9765625, "grad_norm_var": 0.025852203369140625, "learning_rate": 0.0001, "loss": 4.1854, "loss/crossentropy": 2.3252480030059814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20392987877130508, "step": 29356 }, { "epoch": 0.58716, "grad_norm": 1.8359375, "grad_norm_var": 0.015999348958333333, "learning_rate": 0.0001, "loss": 3.9164, "loss/crossentropy": 1.9041990041732788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1759115606546402, "step": 29358 }, { "epoch": 0.5872, "grad_norm": 1.8671875, "grad_norm_var": 0.0171783447265625, "learning_rate": 0.0001, "loss": 3.9968, "loss/crossentropy": 1.9955511689186096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914498507976532, "step": 29360 }, { "epoch": 0.58724, "grad_norm": 2.0, "grad_norm_var": 0.0184814453125, "learning_rate": 0.0001, "loss": 3.7173, "loss/crossentropy": 1.9557825922966003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19295810908079147, "step": 29362 }, { "epoch": 0.58728, "grad_norm": 1.8046875, "grad_norm_var": 0.01850153605143229, "learning_rate": 0.0001, "loss": 3.6251, "loss/crossentropy": 1.4499501585960388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.143144428730011, "step": 29364 }, { "epoch": 0.58732, "grad_norm": 1.765625, "grad_norm_var": 0.010536448160807291, "learning_rate": 0.0001, "loss": 3.8048, "loss/crossentropy": 2.1275508999824524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19290220737457275, "step": 29366 }, { "epoch": 0.58736, "grad_norm": 1.8046875, "grad_norm_var": 0.011177317301432291, "learning_rate": 0.0001, "loss": 3.7983, "loss/crossentropy": 2.320215940475464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19288549572229385, "step": 29368 }, { "epoch": 0.5874, "grad_norm": 1.953125, "grad_norm_var": 0.009248606363932292, "learning_rate": 0.0001, "loss": 4.0161, "loss/crossentropy": 1.9968576431274414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18840061873197556, "step": 29370 }, { "epoch": 0.58744, "grad_norm": 1.953125, "grad_norm_var": 0.005671946207682291, "learning_rate": 0.0001, "loss": 4.2287, "loss/crossentropy": 2.2592599391937256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21203969419002533, "step": 29372 }, { "epoch": 0.58748, "grad_norm": 2.0, "grad_norm_var": 0.007051595052083333, "learning_rate": 0.0001, "loss": 4.1003, "loss/crossentropy": 1.9274957180023193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17894938588142395, "step": 29374 }, { "epoch": 0.58752, "grad_norm": 1.890625, "grad_norm_var": 0.006982421875, "learning_rate": 0.0001, "loss": 3.7913, "loss/crossentropy": 2.252523422241211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21168971061706543, "step": 29376 }, { "epoch": 0.58756, "grad_norm": 1.90625, "grad_norm_var": 0.010188802083333334, "learning_rate": 0.0001, "loss": 3.95, "loss/crossentropy": 2.110221028327942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013455107808113, "step": 29378 }, { "epoch": 0.5876, "grad_norm": 1.8671875, "grad_norm_var": 0.008750152587890626, "learning_rate": 0.0001, "loss": 3.8604, "loss/crossentropy": 1.9283993244171143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17572121322155, "step": 29380 }, { "epoch": 0.58764, "grad_norm": 1.8125, "grad_norm_var": 0.007950592041015624, "learning_rate": 0.0001, "loss": 3.988, "loss/crossentropy": 2.23159658908844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21318458020687103, "step": 29382 }, { "epoch": 0.58768, "grad_norm": 1.8515625, "grad_norm_var": 0.0097076416015625, "learning_rate": 0.0001, "loss": 3.9589, "loss/crossentropy": 2.2170876264572144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22383015602827072, "step": 29384 }, { "epoch": 0.58772, "grad_norm": 1.8671875, "grad_norm_var": 0.011173248291015625, "learning_rate": 0.0001, "loss": 4.0071, "loss/crossentropy": 2.2335199117660522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19595736265182495, "step": 29386 }, { "epoch": 0.58776, "grad_norm": 2.015625, "grad_norm_var": 0.0118072509765625, "learning_rate": 0.0001, "loss": 3.9657, "loss/crossentropy": 2.165065050125122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20269524306058884, "step": 29388 }, { "epoch": 0.5878, "grad_norm": 1.8125, "grad_norm_var": 0.012400054931640625, "learning_rate": 0.0001, "loss": 3.5373, "loss/crossentropy": 1.8111371397972107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17589127272367477, "step": 29390 }, { "epoch": 0.58784, "grad_norm": 1.78125, "grad_norm_var": 0.015110015869140625, "learning_rate": 0.0001, "loss": 3.8698, "loss/crossentropy": 2.083921492099762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19300700724124908, "step": 29392 }, { "epoch": 0.58788, "grad_norm": 1.9453125, "grad_norm_var": 0.010929361979166666, "learning_rate": 0.0001, "loss": 4.168, "loss/crossentropy": 1.916080355644226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18377907574176788, "step": 29394 }, { "epoch": 0.58792, "grad_norm": 1.8828125, "grad_norm_var": 0.011842600504557292, "learning_rate": 0.0001, "loss": 4.2627, "loss/crossentropy": 2.2706268429756165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2135249674320221, "step": 29396 }, { "epoch": 0.58796, "grad_norm": 1.8515625, "grad_norm_var": 0.0125396728515625, "learning_rate": 0.0001, "loss": 3.649, "loss/crossentropy": 1.99881511926651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20278233289718628, "step": 29398 }, { "epoch": 0.588, "grad_norm": 1.8203125, "grad_norm_var": 0.009087880452473959, "learning_rate": 0.0001, "loss": 3.9936, "loss/crossentropy": 1.9932519793510437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18421290814876556, "step": 29400 }, { "epoch": 0.58804, "grad_norm": 1.8125, "grad_norm_var": 0.008893839518229167, "learning_rate": 0.0001, "loss": 4.0858, "loss/crossentropy": 2.287453770637512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20635423064231873, "step": 29402 }, { "epoch": 0.58808, "grad_norm": 1.9921875, "grad_norm_var": 0.008544921875, "learning_rate": 0.0001, "loss": 4.3109, "loss/crossentropy": 2.2095491886138916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934846043586731, "step": 29404 }, { "epoch": 0.58812, "grad_norm": 1.8359375, "grad_norm_var": 0.0108795166015625, "learning_rate": 0.0001, "loss": 4.1356, "loss/crossentropy": 2.0484838485717773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19288712739944458, "step": 29406 }, { "epoch": 0.58816, "grad_norm": 2.0, "grad_norm_var": 0.009110514322916667, "learning_rate": 0.0001, "loss": 3.9269, "loss/crossentropy": 2.431947708129883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21592531353235245, "step": 29408 }, { "epoch": 0.5882, "grad_norm": 1.8671875, "grad_norm_var": 0.009049479166666667, "learning_rate": 0.0001, "loss": 4.0748, "loss/crossentropy": 2.0007177591323853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799403429031372, "step": 29410 }, { "epoch": 0.58824, "grad_norm": 2.03125, "grad_norm_var": 0.009224446614583333, "learning_rate": 0.0001, "loss": 4.1588, "loss/crossentropy": 2.168426990509033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2031354159116745, "step": 29412 }, { "epoch": 0.58828, "grad_norm": 1.9296875, "grad_norm_var": 0.007917277018229167, "learning_rate": 0.0001, "loss": 4.0162, "loss/crossentropy": 2.1504631638526917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19324814528226852, "step": 29414 }, { "epoch": 0.58832, "grad_norm": 1.9375, "grad_norm_var": 0.011515299479166666, "learning_rate": 0.0001, "loss": 3.9259, "loss/crossentropy": 2.013205111026764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18360282480716705, "step": 29416 }, { "epoch": 0.58836, "grad_norm": 1.8828125, "grad_norm_var": 0.010776519775390625, "learning_rate": 0.0001, "loss": 4.0384, "loss/crossentropy": 2.155342757701874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049562856554985, "step": 29418 }, { "epoch": 0.5884, "grad_norm": 1.921875, "grad_norm_var": 0.012041982014973958, "learning_rate": 0.0001, "loss": 4.0741, "loss/crossentropy": 2.117911696434021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18994345515966415, "step": 29420 }, { "epoch": 0.58844, "grad_norm": 1.9140625, "grad_norm_var": 0.009422810872395833, "learning_rate": 0.0001, "loss": 4.0517, "loss/crossentropy": 1.980055332183838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1862584576010704, "step": 29422 }, { "epoch": 0.58848, "grad_norm": 2.078125, "grad_norm_var": 0.009364573160807292, "learning_rate": 0.0001, "loss": 4.1284, "loss/crossentropy": 2.2244694232940674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21067889779806137, "step": 29424 }, { "epoch": 0.58852, "grad_norm": 1.9453125, "grad_norm_var": 0.008876291910807292, "learning_rate": 0.0001, "loss": 4.1993, "loss/crossentropy": 2.213523805141449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20092912763357162, "step": 29426 }, { "epoch": 0.58856, "grad_norm": 2.015625, "grad_norm_var": 0.0081695556640625, "learning_rate": 0.0001, "loss": 4.1802, "loss/crossentropy": 1.9292373657226562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17156004905700684, "step": 29428 }, { "epoch": 0.5886, "grad_norm": 1.796875, "grad_norm_var": 0.011537424723307292, "learning_rate": 0.0001, "loss": 4.0741, "loss/crossentropy": 2.4728565216064453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22577238827943802, "step": 29430 }, { "epoch": 0.58864, "grad_norm": 1.78125, "grad_norm_var": 0.011639149983723958, "learning_rate": 0.0001, "loss": 3.9782, "loss/crossentropy": 1.757906973361969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19193939119577408, "step": 29432 }, { "epoch": 0.58868, "grad_norm": 2.03125, "grad_norm_var": 0.011351521809895833, "learning_rate": 0.0001, "loss": 4.019, "loss/crossentropy": 2.120426893234253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19062675535678864, "step": 29434 }, { "epoch": 0.58872, "grad_norm": 2.140625, "grad_norm_var": 0.01217041015625, "learning_rate": 0.0001, "loss": 3.8409, "loss/crossentropy": 1.7993061542510986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1687781661748886, "step": 29436 }, { "epoch": 0.58876, "grad_norm": 1.90625, "grad_norm_var": 0.01608250935872396, "learning_rate": 0.0001, "loss": 3.775, "loss/crossentropy": 1.995628297328949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18463349342346191, "step": 29438 }, { "epoch": 0.5888, "grad_norm": 1.90625, "grad_norm_var": 0.015329742431640625, "learning_rate": 0.0001, "loss": 3.8311, "loss/crossentropy": 2.411523938179016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045421302318573, "step": 29440 }, { "epoch": 0.58884, "grad_norm": 1.8046875, "grad_norm_var": 0.017288970947265624, "learning_rate": 0.0001, "loss": 3.7145, "loss/crossentropy": 1.9225006103515625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17880843579769135, "step": 29442 }, { "epoch": 0.58888, "grad_norm": 1.8203125, "grad_norm_var": 0.017594401041666666, "learning_rate": 0.0001, "loss": 3.9736, "loss/crossentropy": 2.1082658171653748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19336317479610443, "step": 29444 }, { "epoch": 0.58892, "grad_norm": 1.921875, "grad_norm_var": 0.013362630208333334, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 2.350805640220642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19736752659082413, "step": 29446 }, { "epoch": 0.58896, "grad_norm": 1.875, "grad_norm_var": 0.009712727864583333, "learning_rate": 0.0001, "loss": 3.9396, "loss/crossentropy": 2.1030595302581787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1941584125161171, "step": 29448 }, { "epoch": 0.589, "grad_norm": 1.765625, "grad_norm_var": 0.009700266520182292, "learning_rate": 0.0001, "loss": 3.6338, "loss/crossentropy": 2.173567295074463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19058758020401, "step": 29450 }, { "epoch": 0.58904, "grad_norm": 1.8984375, "grad_norm_var": 0.0044514973958333336, "learning_rate": 0.0001, "loss": 4.1794, "loss/crossentropy": 2.113981068134308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183458924293518, "step": 29452 }, { "epoch": 0.58908, "grad_norm": 1.6796875, "grad_norm_var": 0.0050537109375, "learning_rate": 0.0001, "loss": 3.8333, "loss/crossentropy": 1.6564378142356873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16441450640559196, "step": 29454 }, { "epoch": 0.58912, "grad_norm": 1.671875, "grad_norm_var": 0.007103474934895834, "learning_rate": 0.0001, "loss": 3.591, "loss/crossentropy": 1.7756662368774414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805124208331108, "step": 29456 }, { "epoch": 0.58916, "grad_norm": 1.828125, "grad_norm_var": 0.006990305582682292, "learning_rate": 0.0001, "loss": 3.8145, "loss/crossentropy": 1.825185477733612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1628718078136444, "step": 29458 }, { "epoch": 0.5892, "grad_norm": 1.875, "grad_norm_var": 0.006001790364583333, "learning_rate": 0.0001, "loss": 3.9426, "loss/crossentropy": 2.0170122385025024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19748982787132263, "step": 29460 }, { "epoch": 0.58924, "grad_norm": 1.828125, "grad_norm_var": 0.006783040364583334, "learning_rate": 0.0001, "loss": 3.8533, "loss/crossentropy": 1.8711951971054077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20168818533420563, "step": 29462 }, { "epoch": 0.58928, "grad_norm": 1.9140625, "grad_norm_var": 0.006648508707682291, "learning_rate": 0.0001, "loss": 3.7884, "loss/crossentropy": 1.668130338191986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1706988662481308, "step": 29464 }, { "epoch": 0.58932, "grad_norm": 1.9140625, "grad_norm_var": 0.006552886962890625, "learning_rate": 0.0001, "loss": 3.8763, "loss/crossentropy": 1.8372064232826233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17468038201332092, "step": 29466 }, { "epoch": 0.58936, "grad_norm": 1.8203125, "grad_norm_var": 0.006811269124348958, "learning_rate": 0.0001, "loss": 4.235, "loss/crossentropy": 2.234304904937744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845388039946556, "step": 29468 }, { "epoch": 0.5894, "grad_norm": 1.7421875, "grad_norm_var": 0.00662841796875, "learning_rate": 0.0001, "loss": 3.5877, "loss/crossentropy": 1.9101244807243347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17081473767757416, "step": 29470 }, { "epoch": 0.58944, "grad_norm": 1.8046875, "grad_norm_var": 0.004353586832682292, "learning_rate": 0.0001, "loss": 4.0295, "loss/crossentropy": 2.2079320549964905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952483355998993, "step": 29472 }, { "epoch": 0.58948, "grad_norm": 1.953125, "grad_norm_var": 0.00458984375, "learning_rate": 0.0001, "loss": 3.9308, "loss/crossentropy": 1.8550963997840881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18943903595209122, "step": 29474 }, { "epoch": 0.58952, "grad_norm": 1.9140625, "grad_norm_var": 0.004865519205729167, "learning_rate": 0.0001, "loss": 3.7654, "loss/crossentropy": 1.7661077976226807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19584594666957855, "step": 29476 }, { "epoch": 0.58956, "grad_norm": 1.9296875, "grad_norm_var": 0.005029042561848958, "learning_rate": 0.0001, "loss": 4.2088, "loss/crossentropy": 2.338898777961731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21608547866344452, "step": 29478 }, { "epoch": 0.5896, "grad_norm": 1.8203125, "grad_norm_var": 0.005242665608723958, "learning_rate": 0.0001, "loss": 4.1212, "loss/crossentropy": 2.3376599550247192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20884696394205093, "step": 29480 }, { "epoch": 0.58964, "grad_norm": 1.8125, "grad_norm_var": 0.005964152018229167, "learning_rate": 0.0001, "loss": 4.16, "loss/crossentropy": 2.241575002670288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200220599770546, "step": 29482 }, { "epoch": 0.58968, "grad_norm": 2.328125, "grad_norm_var": 0.017805735270182293, "learning_rate": 0.0001, "loss": 4.0612, "loss/crossentropy": 2.010516047477722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22505514323711395, "step": 29484 }, { "epoch": 0.58972, "grad_norm": 2.0625, "grad_norm_var": 0.017545318603515624, "learning_rate": 0.0001, "loss": 3.9692, "loss/crossentropy": 2.0848931670188904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21239446848630905, "step": 29486 }, { "epoch": 0.58976, "grad_norm": 1.921875, "grad_norm_var": 0.016315714518229166, "learning_rate": 0.0001, "loss": 4.1173, "loss/crossentropy": 2.0518574118614197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19289197772741318, "step": 29488 }, { "epoch": 0.5898, "grad_norm": 2.0625, "grad_norm_var": 0.016460927327473958, "learning_rate": 0.0001, "loss": 4.2193, "loss/crossentropy": 2.1830244064331055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21049733459949493, "step": 29490 }, { "epoch": 0.58984, "grad_norm": 2.0, "grad_norm_var": 0.018833160400390625, "learning_rate": 0.0001, "loss": 4.416, "loss/crossentropy": 2.1932766437530518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886359080672264, "step": 29492 }, { "epoch": 0.58988, "grad_norm": 1.9921875, "grad_norm_var": 0.018595123291015626, "learning_rate": 0.0001, "loss": 4.1936, "loss/crossentropy": 2.0677687525749207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19729864597320557, "step": 29494 }, { "epoch": 0.58992, "grad_norm": 1.984375, "grad_norm_var": 0.0149169921875, "learning_rate": 0.0001, "loss": 4.1461, "loss/crossentropy": 2.181239366531372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087482586503029, "step": 29496 }, { "epoch": 0.58996, "grad_norm": 1.890625, "grad_norm_var": 0.013132476806640625, "learning_rate": 0.0001, "loss": 4.0357, "loss/crossentropy": 2.2734841108322144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22230970114469528, "step": 29498 }, { "epoch": 0.59, "grad_norm": 1.9921875, "grad_norm_var": 0.007454427083333334, "learning_rate": 0.0001, "loss": 4.1009, "loss/crossentropy": 2.1078773736953735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18602187931537628, "step": 29500 }, { "epoch": 0.59004, "grad_norm": 2.046875, "grad_norm_var": 0.009635416666666667, "learning_rate": 0.0001, "loss": 3.7355, "loss/crossentropy": 1.856580138206482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186849907040596, "step": 29502 }, { "epoch": 0.59008, "grad_norm": 1.8125, "grad_norm_var": 0.011842600504557292, "learning_rate": 0.0001, "loss": 3.6756, "loss/crossentropy": 1.90110582113266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18654846400022507, "step": 29504 }, { "epoch": 0.59012, "grad_norm": 1.9765625, "grad_norm_var": 0.0134918212890625, "learning_rate": 0.0001, "loss": 4.0382, "loss/crossentropy": 1.874162197113037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18833528459072113, "step": 29506 }, { "epoch": 0.59016, "grad_norm": 1.890625, "grad_norm_var": 0.009694163004557292, "learning_rate": 0.0001, "loss": 4.0473, "loss/crossentropy": 2.0299471020698547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19723627716302872, "step": 29508 }, { "epoch": 0.5902, "grad_norm": 1.859375, "grad_norm_var": 0.01085205078125, "learning_rate": 0.0001, "loss": 3.9377, "loss/crossentropy": 1.8558722138404846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18302107602357864, "step": 29510 }, { "epoch": 0.59024, "grad_norm": 1.9453125, "grad_norm_var": 0.010359446207682291, "learning_rate": 0.0001, "loss": 4.0567, "loss/crossentropy": 1.8753638863563538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061474472284317, "step": 29512 }, { "epoch": 0.59028, "grad_norm": 1.8125, "grad_norm_var": 0.011954498291015626, "learning_rate": 0.0001, "loss": 3.8181, "loss/crossentropy": 1.9646863341331482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18984466791152954, "step": 29514 }, { "epoch": 0.59032, "grad_norm": 1.8515625, "grad_norm_var": 0.0124755859375, "learning_rate": 0.0001, "loss": 3.7632, "loss/crossentropy": 1.9403189420700073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17063701152801514, "step": 29516 }, { "epoch": 0.59036, "grad_norm": 1.8359375, "grad_norm_var": 0.010774739583333333, "learning_rate": 0.0001, "loss": 3.7807, "loss/crossentropy": 1.9278589487075806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1793733760714531, "step": 29518 }, { "epoch": 0.5904, "grad_norm": 2.28125, "grad_norm_var": 0.01877009073893229, "learning_rate": 0.0001, "loss": 4.109, "loss/crossentropy": 2.2447856664657593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21776193380355835, "step": 29520 }, { "epoch": 0.59044, "grad_norm": 1.84375, "grad_norm_var": 0.015572102864583333, "learning_rate": 0.0001, "loss": 4.0259, "loss/crossentropy": 2.3484312891960144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081959918141365, "step": 29522 }, { "epoch": 0.59048, "grad_norm": 2.015625, "grad_norm_var": 0.016434478759765624, "learning_rate": 0.0001, "loss": 3.9571, "loss/crossentropy": 2.121490001678467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033299133181572, "step": 29524 }, { "epoch": 0.59052, "grad_norm": 2.140625, "grad_norm_var": 0.017758941650390624, "learning_rate": 0.0001, "loss": 3.9742, "loss/crossentropy": 2.3825392723083496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22553616762161255, "step": 29526 }, { "epoch": 0.59056, "grad_norm": 1.9296875, "grad_norm_var": 0.01761652628580729, "learning_rate": 0.0001, "loss": 3.9717, "loss/crossentropy": 2.027899742126465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18623250722885132, "step": 29528 }, { "epoch": 0.5906, "grad_norm": 2.03125, "grad_norm_var": 0.017732747395833335, "learning_rate": 0.0001, "loss": 4.3458, "loss/crossentropy": 2.231070041656494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22132613509893417, "step": 29530 }, { "epoch": 0.59064, "grad_norm": 2.015625, "grad_norm_var": 0.01613133748372396, "learning_rate": 0.0001, "loss": 4.1003, "loss/crossentropy": 2.16735976934433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951517015695572, "step": 29532 }, { "epoch": 0.59068, "grad_norm": 2.234375, "grad_norm_var": 0.019188435872395833, "learning_rate": 0.0001, "loss": 3.8917, "loss/crossentropy": 1.8973720073699951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18189363181591034, "step": 29534 }, { "epoch": 0.59072, "grad_norm": 1.9375, "grad_norm_var": 0.01687800089518229, "learning_rate": 0.0001, "loss": 3.7249, "loss/crossentropy": 2.0840883255004883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19777310639619827, "step": 29536 }, { "epoch": 0.59076, "grad_norm": 2.03125, "grad_norm_var": 0.016291300455729168, "learning_rate": 0.0001, "loss": 3.8481, "loss/crossentropy": 1.7272367477416992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1722969114780426, "step": 29538 }, { "epoch": 0.5908, "grad_norm": 1.984375, "grad_norm_var": 0.015024566650390625, "learning_rate": 0.0001, "loss": 3.9883, "loss/crossentropy": 2.2268466353416443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19153434038162231, "step": 29540 }, { "epoch": 0.59084, "grad_norm": 1.921875, "grad_norm_var": 0.012739817301432291, "learning_rate": 0.0001, "loss": 4.0964, "loss/crossentropy": 2.4328715801239014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21078508347272873, "step": 29542 }, { "epoch": 0.59088, "grad_norm": 1.9921875, "grad_norm_var": 0.013325754801432292, "learning_rate": 0.0001, "loss": 4.0033, "loss/crossentropy": 1.9592428803443909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19666817784309387, "step": 29544 }, { "epoch": 0.59092, "grad_norm": 1.9296875, "grad_norm_var": 0.011620076497395833, "learning_rate": 0.0001, "loss": 4.203, "loss/crossentropy": 1.8915135860443115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18332552164793015, "step": 29546 }, { "epoch": 0.59096, "grad_norm": 1.890625, "grad_norm_var": 0.011834462483723959, "learning_rate": 0.0001, "loss": 3.884, "loss/crossentropy": 1.9134865403175354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2012096270918846, "step": 29548 }, { "epoch": 0.591, "grad_norm": 1.9765625, "grad_norm_var": 0.0061948140462239586, "learning_rate": 0.0001, "loss": 4.2232, "loss/crossentropy": 2.280379056930542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19985026121139526, "step": 29550 }, { "epoch": 0.59104, "grad_norm": 1.921875, "grad_norm_var": 0.00355224609375, "learning_rate": 0.0001, "loss": 4.144, "loss/crossentropy": 2.000941574573517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18684983253479004, "step": 29552 }, { "epoch": 0.59108, "grad_norm": 1.7734375, "grad_norm_var": 0.005020904541015625, "learning_rate": 0.0001, "loss": 3.7859, "loss/crossentropy": 1.9749475121498108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18268440663814545, "step": 29554 }, { "epoch": 0.59112, "grad_norm": 1.9375, "grad_norm_var": 0.0055653889973958336, "learning_rate": 0.0001, "loss": 4.0483, "loss/crossentropy": 2.169970750808716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20792870223522186, "step": 29556 }, { "epoch": 0.59116, "grad_norm": 1.8671875, "grad_norm_var": 0.005712890625, "learning_rate": 0.0001, "loss": 4.0875, "loss/crossentropy": 2.059566855430603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1943632885813713, "step": 29558 }, { "epoch": 0.5912, "grad_norm": 2.53125, "grad_norm_var": 0.0299713134765625, "learning_rate": 0.0001, "loss": 4.0545, "loss/crossentropy": 1.80315762758255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15433073043823242, "step": 29560 }, { "epoch": 0.59124, "grad_norm": 2.109375, "grad_norm_var": 0.0330078125, "learning_rate": 0.0001, "loss": 3.9351, "loss/crossentropy": 2.1045247316360474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19684398919343948, "step": 29562 }, { "epoch": 0.59128, "grad_norm": 1.8671875, "grad_norm_var": 0.03242162068684896, "learning_rate": 0.0001, "loss": 4.1244, "loss/crossentropy": 2.2456302642822266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1998264193534851, "step": 29564 }, { "epoch": 0.59132, "grad_norm": 1.96875, "grad_norm_var": 0.032389322916666664, "learning_rate": 0.0001, "loss": 3.8945, "loss/crossentropy": 2.105454444885254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18675578385591507, "step": 29566 }, { "epoch": 0.59136, "grad_norm": 2.0, "grad_norm_var": 0.03316624959309896, "learning_rate": 0.0001, "loss": 4.0291, "loss/crossentropy": 1.8126618266105652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19125473499298096, "step": 29568 }, { "epoch": 0.5914, "grad_norm": 1.8671875, "grad_norm_var": 0.032246907552083336, "learning_rate": 0.0001, "loss": 3.873, "loss/crossentropy": 1.8100510239601135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18192324042320251, "step": 29570 }, { "epoch": 0.59144, "grad_norm": 1.890625, "grad_norm_var": 0.031160481770833335, "learning_rate": 0.0001, "loss": 3.6655, "loss/crossentropy": 1.7124195098876953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16115359216928482, "step": 29572 }, { "epoch": 0.59148, "grad_norm": 1.9375, "grad_norm_var": 0.03163960774739583, "learning_rate": 0.0001, "loss": 3.9359, "loss/crossentropy": 2.2159979939460754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20147743076086044, "step": 29574 }, { "epoch": 0.59152, "grad_norm": 1.9609375, "grad_norm_var": 0.009722646077473958, "learning_rate": 0.0001, "loss": 4.2278, "loss/crossentropy": 2.195053517818451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20202681422233582, "step": 29576 }, { "epoch": 0.59156, "grad_norm": 1.8515625, "grad_norm_var": 0.007806142171223958, "learning_rate": 0.0001, "loss": 4.1261, "loss/crossentropy": 1.992495834827423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18599994480609894, "step": 29578 }, { "epoch": 0.5916, "grad_norm": 2.03125, "grad_norm_var": 0.008147939046223959, "learning_rate": 0.0001, "loss": 4.0063, "loss/crossentropy": 2.1876922845840454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197817362844944, "step": 29580 }, { "epoch": 0.59164, "grad_norm": 2.03125, "grad_norm_var": 0.008455403645833333, "learning_rate": 0.0001, "loss": 4.1484, "loss/crossentropy": 2.1962021589279175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2175462618470192, "step": 29582 }, { "epoch": 0.59168, "grad_norm": 1.90625, "grad_norm_var": 0.011242421468098958, "learning_rate": 0.0001, "loss": 3.6774, "loss/crossentropy": 1.9215996265411377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18740177899599075, "step": 29584 }, { "epoch": 0.59172, "grad_norm": 1.9296875, "grad_norm_var": 0.007787068684895833, "learning_rate": 0.0001, "loss": 3.862, "loss/crossentropy": 1.6820173263549805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1574452817440033, "step": 29586 }, { "epoch": 0.59176, "grad_norm": 1.9296875, "grad_norm_var": 0.007877349853515625, "learning_rate": 0.0001, "loss": 4.0942, "loss/crossentropy": 2.1028271913528442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187875434756279, "step": 29588 }, { "epoch": 0.5918, "grad_norm": 1.859375, "grad_norm_var": 0.007865142822265626, "learning_rate": 0.0001, "loss": 3.7429, "loss/crossentropy": 1.9593722224235535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1871783211827278, "step": 29590 }, { "epoch": 0.59184, "grad_norm": 1.953125, "grad_norm_var": 0.007901763916015625, "learning_rate": 0.0001, "loss": 4.1708, "loss/crossentropy": 2.1829755902290344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22405223548412323, "step": 29592 }, { "epoch": 0.59188, "grad_norm": 2.03125, "grad_norm_var": 0.007330067952473958, "learning_rate": 0.0001, "loss": 3.9746, "loss/crossentropy": 1.9692201018333435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18905997276306152, "step": 29594 }, { "epoch": 0.59192, "grad_norm": 1.9765625, "grad_norm_var": 0.008931223551432292, "learning_rate": 0.0001, "loss": 4.1957, "loss/crossentropy": 2.341002106666565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21306651830673218, "step": 29596 }, { "epoch": 0.59196, "grad_norm": 2.03125, "grad_norm_var": 0.010453287760416667, "learning_rate": 0.0001, "loss": 4.1617, "loss/crossentropy": 1.9179102778434753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1874692291021347, "step": 29598 }, { "epoch": 0.592, "grad_norm": 2.328125, "grad_norm_var": 0.015421295166015625, "learning_rate": 0.0001, "loss": 4.036, "loss/crossentropy": 1.8973752856254578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19447319954633713, "step": 29600 }, { "epoch": 0.59204, "grad_norm": 1.8359375, "grad_norm_var": 0.016501617431640626, "learning_rate": 0.0001, "loss": 3.8856, "loss/crossentropy": 1.8788208365440369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16276411712169647, "step": 29602 }, { "epoch": 0.59208, "grad_norm": 1.9453125, "grad_norm_var": 0.01578547159830729, "learning_rate": 0.0001, "loss": 3.9585, "loss/crossentropy": 1.5552323460578918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1751468926668167, "step": 29604 }, { "epoch": 0.59212, "grad_norm": 1.921875, "grad_norm_var": 0.014273834228515626, "learning_rate": 0.0001, "loss": 4.0705, "loss/crossentropy": 1.906869649887085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18109184503555298, "step": 29606 }, { "epoch": 0.59216, "grad_norm": 1.8984375, "grad_norm_var": 0.014764149983723959, "learning_rate": 0.0001, "loss": 4.1151, "loss/crossentropy": 2.23270046710968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23064442723989487, "step": 29608 }, { "epoch": 0.5922, "grad_norm": 2.0625, "grad_norm_var": 0.01395263671875, "learning_rate": 0.0001, "loss": 4.0758, "loss/crossentropy": 2.187607169151306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993623673915863, "step": 29610 }, { "epoch": 0.59224, "grad_norm": 2.046875, "grad_norm_var": 0.013425445556640625, "learning_rate": 0.0001, "loss": 4.1025, "loss/crossentropy": 2.1429349184036255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19814343750476837, "step": 29612 }, { "epoch": 0.59228, "grad_norm": 1.859375, "grad_norm_var": 0.014899698893229167, "learning_rate": 0.0001, "loss": 3.6302, "loss/crossentropy": 1.9003735780715942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17693869769573212, "step": 29614 }, { "epoch": 0.59232, "grad_norm": 2.265625, "grad_norm_var": 0.0130523681640625, "learning_rate": 0.0001, "loss": 3.991, "loss/crossentropy": 2.2013859152793884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19231925159692764, "step": 29616 }, { "epoch": 0.59236, "grad_norm": 1.8671875, "grad_norm_var": 0.01234130859375, "learning_rate": 0.0001, "loss": 3.9533, "loss/crossentropy": 1.3149051070213318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1432316154241562, "step": 29618 }, { "epoch": 0.5924, "grad_norm": 1.8984375, "grad_norm_var": 0.013728841145833334, "learning_rate": 0.0001, "loss": 4.0781, "loss/crossentropy": 2.099441707134247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19838576018810272, "step": 29620 }, { "epoch": 0.59244, "grad_norm": 1.765625, "grad_norm_var": 0.015970865885416668, "learning_rate": 0.0001, "loss": 3.9236, "loss/crossentropy": 1.9029900431632996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029390037059784, "step": 29622 }, { "epoch": 0.59248, "grad_norm": 1.8203125, "grad_norm_var": 0.01672337849934896, "learning_rate": 0.0001, "loss": 4.0291, "loss/crossentropy": 2.159297227859497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1991179883480072, "step": 29624 }, { "epoch": 0.59252, "grad_norm": 1.921875, "grad_norm_var": 0.014964803059895834, "learning_rate": 0.0001, "loss": 4.1908, "loss/crossentropy": 2.0404341220855713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894593983888626, "step": 29626 }, { "epoch": 0.59256, "grad_norm": 1.921875, "grad_norm_var": 0.012286122639973958, "learning_rate": 0.0001, "loss": 4.0989, "loss/crossentropy": 2.2824409008026123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19680628180503845, "step": 29628 }, { "epoch": 0.5926, "grad_norm": 1.921875, "grad_norm_var": 0.011905924479166666, "learning_rate": 0.0001, "loss": 4.131, "loss/crossentropy": 2.2231918573379517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1871175393462181, "step": 29630 }, { "epoch": 0.59264, "grad_norm": 1.796875, "grad_norm_var": 0.0030263264973958335, "learning_rate": 0.0001, "loss": 3.9817, "loss/crossentropy": 1.688015341758728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1651618853211403, "step": 29632 }, { "epoch": 0.59268, "grad_norm": 1.90625, "grad_norm_var": 0.003173828125, "learning_rate": 0.0001, "loss": 3.915, "loss/crossentropy": 1.8363903760910034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848619431257248, "step": 29634 }, { "epoch": 0.59272, "grad_norm": 1.90625, "grad_norm_var": 0.0029937744140625, "learning_rate": 0.0001, "loss": 3.7981, "loss/crossentropy": 2.2170976400375366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.199843168258667, "step": 29636 }, { "epoch": 0.59276, "grad_norm": 1.8203125, "grad_norm_var": 0.0033770243326822916, "learning_rate": 0.0001, "loss": 4.0177, "loss/crossentropy": 1.707352876663208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17033107578754425, "step": 29638 }, { "epoch": 0.5928, "grad_norm": 1.96875, "grad_norm_var": 0.003295644124348958, "learning_rate": 0.0001, "loss": 4.1408, "loss/crossentropy": 2.418645143508911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20665095001459122, "step": 29640 }, { "epoch": 0.59284, "grad_norm": 2.046875, "grad_norm_var": 0.00450439453125, "learning_rate": 0.0001, "loss": 4.0686, "loss/crossentropy": 2.0012764930725098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17807253450155258, "step": 29642 }, { "epoch": 0.59288, "grad_norm": 1.859375, "grad_norm_var": 0.0054013570149739586, "learning_rate": 0.0001, "loss": 4.0358, "loss/crossentropy": 2.4282820224761963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22419389337301254, "step": 29644 }, { "epoch": 0.59292, "grad_norm": 1.90625, "grad_norm_var": 0.005669911702473958, "learning_rate": 0.0001, "loss": 3.9593, "loss/crossentropy": 1.9802210927009583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18829302489757538, "step": 29646 }, { "epoch": 0.59296, "grad_norm": 1.8671875, "grad_norm_var": 0.004959869384765625, "learning_rate": 0.0001, "loss": 4.1733, "loss/crossentropy": 2.1611366271972656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20142409205436707, "step": 29648 }, { "epoch": 0.593, "grad_norm": 1.84375, "grad_norm_var": 0.006009674072265625, "learning_rate": 0.0001, "loss": 3.7293, "loss/crossentropy": 2.0067127346992493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892910748720169, "step": 29650 }, { "epoch": 0.59304, "grad_norm": 1.875, "grad_norm_var": 0.006591796875, "learning_rate": 0.0001, "loss": 3.9099, "loss/crossentropy": 1.8918231129646301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17595022916793823, "step": 29652 }, { "epoch": 0.59308, "grad_norm": 1.9296875, "grad_norm_var": 0.00543212890625, "learning_rate": 0.0001, "loss": 4.0838, "loss/crossentropy": 2.310000419616699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20750866830348969, "step": 29654 }, { "epoch": 0.59312, "grad_norm": 1.953125, "grad_norm_var": 0.005448404947916667, "learning_rate": 0.0001, "loss": 3.7038, "loss/crossentropy": 2.0080875158309937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18810434639453888, "step": 29656 }, { "epoch": 0.59316, "grad_norm": 1.8359375, "grad_norm_var": 0.0039670308430989586, "learning_rate": 0.0001, "loss": 3.8419, "loss/crossentropy": 1.9152463674545288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17956481873989105, "step": 29658 }, { "epoch": 0.5932, "grad_norm": 1.828125, "grad_norm_var": 0.002982330322265625, "learning_rate": 0.0001, "loss": 3.9465, "loss/crossentropy": 2.019734025001526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1654059886932373, "step": 29660 }, { "epoch": 0.59324, "grad_norm": 1.9609375, "grad_norm_var": 0.003242746988932292, "learning_rate": 0.0001, "loss": 3.9672, "loss/crossentropy": 1.9415631294250488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18466275930404663, "step": 29662 }, { "epoch": 0.59328, "grad_norm": 1.8671875, "grad_norm_var": 0.0037737528483072916, "learning_rate": 0.0001, "loss": 4.0275, "loss/crossentropy": 2.246294617652893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20313822478055954, "step": 29664 }, { "epoch": 0.59332, "grad_norm": 2.0625, "grad_norm_var": 0.005242665608723958, "learning_rate": 0.0001, "loss": 4.1661, "loss/crossentropy": 1.8673067688941956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1858188509941101, "step": 29666 }, { "epoch": 0.59336, "grad_norm": 1.8671875, "grad_norm_var": 0.00509033203125, "learning_rate": 0.0001, "loss": 4.1556, "loss/crossentropy": 1.9696037769317627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1936260536313057, "step": 29668 }, { "epoch": 0.5934, "grad_norm": 2.015625, "grad_norm_var": 0.10892512003580729, "learning_rate": 0.0001, "loss": 4.1505, "loss/crossentropy": 2.2883976697921753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22077778726816177, "step": 29670 }, { "epoch": 0.59344, "grad_norm": 1.8515625, "grad_norm_var": 0.1094378153483073, "learning_rate": 0.0001, "loss": 3.9617, "loss/crossentropy": 2.11427104473114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877901256084442, "step": 29672 }, { "epoch": 0.59348, "grad_norm": 1.8359375, "grad_norm_var": 0.1084307352701823, "learning_rate": 0.0001, "loss": 4.02, "loss/crossentropy": 2.256463050842285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980907917022705, "step": 29674 }, { "epoch": 0.59352, "grad_norm": 2.125, "grad_norm_var": 0.10796890258789063, "learning_rate": 0.0001, "loss": 3.926, "loss/crossentropy": 2.0554774403572083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19413713365793228, "step": 29676 }, { "epoch": 0.59356, "grad_norm": 2.015625, "grad_norm_var": 0.10651626586914062, "learning_rate": 0.0001, "loss": 3.9095, "loss/crossentropy": 2.0619908571243286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20501630008220673, "step": 29678 }, { "epoch": 0.5936, "grad_norm": 1.8671875, "grad_norm_var": 0.10798314412434896, "learning_rate": 0.0001, "loss": 3.8964, "loss/crossentropy": 2.0001166462898254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1859305128455162, "step": 29680 }, { "epoch": 0.59364, "grad_norm": 1.84375, "grad_norm_var": 0.11613667805989583, "learning_rate": 0.0001, "loss": 3.7957, "loss/crossentropy": 2.0025678277015686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18820206075906754, "step": 29682 }, { "epoch": 0.59368, "grad_norm": 1.84375, "grad_norm_var": 0.11686604817708333, "learning_rate": 0.0001, "loss": 4.0391, "loss/crossentropy": 2.0470725297927856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986207813024521, "step": 29684 }, { "epoch": 0.59372, "grad_norm": 2.015625, "grad_norm_var": 0.010619862874348959, "learning_rate": 0.0001, "loss": 3.7557, "loss/crossentropy": 1.9196028113365173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19124136865139008, "step": 29686 }, { "epoch": 0.59376, "grad_norm": 1.96875, "grad_norm_var": 0.013315582275390625, "learning_rate": 0.0001, "loss": 3.6694, "loss/crossentropy": 1.624564528465271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15498200803995132, "step": 29688 }, { "epoch": 0.5938, "grad_norm": 1.8828125, "grad_norm_var": 0.012060546875, "learning_rate": 0.0001, "loss": 4.0355, "loss/crossentropy": 2.147800922393799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19418001919984818, "step": 29690 }, { "epoch": 0.59384, "grad_norm": 1.953125, "grad_norm_var": 0.008719635009765626, "learning_rate": 0.0001, "loss": 4.0047, "loss/crossentropy": 2.1515848636627197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17781265825033188, "step": 29692 }, { "epoch": 0.59388, "grad_norm": 1.9765625, "grad_norm_var": 0.0083740234375, "learning_rate": 0.0001, "loss": 3.9553, "loss/crossentropy": 2.034223735332489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19139179587364197, "step": 29694 }, { "epoch": 0.59392, "grad_norm": 2.0, "grad_norm_var": 0.009226226806640625, "learning_rate": 0.0001, "loss": 3.9402, "loss/crossentropy": 1.8839016556739807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1837429627776146, "step": 29696 }, { "epoch": 0.59396, "grad_norm": 1.96875, "grad_norm_var": 0.006878407796223959, "learning_rate": 0.0001, "loss": 4.2001, "loss/crossentropy": 2.367957592010498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21486172825098038, "step": 29698 }, { "epoch": 0.594, "grad_norm": 1.7890625, "grad_norm_var": 0.008062489827473958, "learning_rate": 0.0001, "loss": 3.8494, "loss/crossentropy": 2.02916818857193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17551806569099426, "step": 29700 }, { "epoch": 0.59404, "grad_norm": 1.8515625, "grad_norm_var": 0.009384918212890624, "learning_rate": 0.0001, "loss": 4.1648, "loss/crossentropy": 1.8610569834709167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17259076982736588, "step": 29702 }, { "epoch": 0.59408, "grad_norm": 1.8203125, "grad_norm_var": 0.007420857747395833, "learning_rate": 0.0001, "loss": 4.0775, "loss/crossentropy": 1.8655198812484741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17619968950748444, "step": 29704 }, { "epoch": 0.59412, "grad_norm": 1.9609375, "grad_norm_var": 0.007775624593098958, "learning_rate": 0.0001, "loss": 4.0338, "loss/crossentropy": 1.7603416442871094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15166223049163818, "step": 29706 }, { "epoch": 0.59416, "grad_norm": 1.8984375, "grad_norm_var": 0.007726796468098958, "learning_rate": 0.0001, "loss": 3.8432, "loss/crossentropy": 1.9816042184829712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975991204380989, "step": 29708 }, { "epoch": 0.5942, "grad_norm": 1.9453125, "grad_norm_var": 0.0070574442545572914, "learning_rate": 0.0001, "loss": 4.1009, "loss/crossentropy": 2.2220958471298218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20013637095689774, "step": 29710 }, { "epoch": 0.59424, "grad_norm": 2.203125, "grad_norm_var": 0.012719472249348959, "learning_rate": 0.0001, "loss": 4.0315, "loss/crossentropy": 1.947887122631073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19075323641300201, "step": 29712 }, { "epoch": 0.59428, "grad_norm": 1.796875, "grad_norm_var": 0.014286295572916666, "learning_rate": 0.0001, "loss": 4.0852, "loss/crossentropy": 2.3358702659606934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21242079883813858, "step": 29714 }, { "epoch": 0.59432, "grad_norm": 1.828125, "grad_norm_var": 0.01927464803059896, "learning_rate": 0.0001, "loss": 4.1379, "loss/crossentropy": 1.746841311454773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1843656674027443, "step": 29716 }, { "epoch": 0.59436, "grad_norm": 1.875, "grad_norm_var": 0.0187255859375, "learning_rate": 0.0001, "loss": 4.1713, "loss/crossentropy": 2.283482313156128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116861194372177, "step": 29718 }, { "epoch": 0.5944, "grad_norm": 1.90625, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 4.0167, "loss/crossentropy": 1.927943468093872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18120041489601135, "step": 29720 }, { "epoch": 0.59444, "grad_norm": 1.8046875, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 3.6467, "loss/crossentropy": 1.8562633395195007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756514087319374, "step": 29722 }, { "epoch": 0.59448, "grad_norm": 1.8359375, "grad_norm_var": 0.021862538655598958, "learning_rate": 0.0001, "loss": 3.7351, "loss/crossentropy": 2.0243565440177917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839814931154251, "step": 29724 }, { "epoch": 0.59452, "grad_norm": 1.8671875, "grad_norm_var": 0.023227691650390625, "learning_rate": 0.0001, "loss": 4.0603, "loss/crossentropy": 2.136290669441223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19461742788553238, "step": 29726 }, { "epoch": 0.59456, "grad_norm": 1.9296875, "grad_norm_var": 0.017308553059895832, "learning_rate": 0.0001, "loss": 4.1118, "loss/crossentropy": 2.1178980469703674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19608290493488312, "step": 29728 }, { "epoch": 0.5946, "grad_norm": 2.015625, "grad_norm_var": 0.016209920247395832, "learning_rate": 0.0001, "loss": 4.1603, "loss/crossentropy": 2.1932308673858643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19451629370450974, "step": 29730 }, { "epoch": 0.59464, "grad_norm": 2.21875, "grad_norm_var": 0.014890289306640625, "learning_rate": 0.0001, "loss": 4.1111, "loss/crossentropy": 2.140673279762268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19651110470294952, "step": 29732 }, { "epoch": 0.59468, "grad_norm": 1.953125, "grad_norm_var": 0.013423411051432292, "learning_rate": 0.0001, "loss": 4.0438, "loss/crossentropy": 2.174296498298645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2079356089234352, "step": 29734 }, { "epoch": 0.59472, "grad_norm": 1.953125, "grad_norm_var": 0.015730539957682293, "learning_rate": 0.0001, "loss": 4.0483, "loss/crossentropy": 1.7323570847511292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18022552132606506, "step": 29736 }, { "epoch": 0.59476, "grad_norm": 1.90625, "grad_norm_var": 0.011229451497395833, "learning_rate": 0.0001, "loss": 3.9672, "loss/crossentropy": 2.0798590779304504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19130367040634155, "step": 29738 }, { "epoch": 0.5948, "grad_norm": 1.9921875, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 3.9728, "loss/crossentropy": 2.1414352655410767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19515492022037506, "step": 29740 }, { "epoch": 0.59484, "grad_norm": 1.875, "grad_norm_var": 0.01082763671875, "learning_rate": 0.0001, "loss": 3.8085, "loss/crossentropy": 2.0827420949935913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836787909269333, "step": 29742 }, { "epoch": 0.59488, "grad_norm": 2.015625, "grad_norm_var": 0.010960896809895834, "learning_rate": 0.0001, "loss": 4.1457, "loss/crossentropy": 2.2884727716445923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20683631300926208, "step": 29744 }, { "epoch": 0.59492, "grad_norm": 1.8984375, "grad_norm_var": 0.010838826497395834, "learning_rate": 0.0001, "loss": 3.9837, "loss/crossentropy": 1.6755734086036682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17120809853076935, "step": 29746 }, { "epoch": 0.59496, "grad_norm": 1.8671875, "grad_norm_var": 0.0085205078125, "learning_rate": 0.0001, "loss": 4.1012, "loss/crossentropy": 1.8576670289039612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19007987529039383, "step": 29748 }, { "epoch": 0.595, "grad_norm": 1.9140625, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 4.0576, "loss/crossentropy": 1.9877051711082458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18861526250839233, "step": 29750 }, { "epoch": 0.59504, "grad_norm": 1.90625, "grad_norm_var": 0.0072591145833333336, "learning_rate": 0.0001, "loss": 3.9335, "loss/crossentropy": 2.4191300868988037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063479870557785, "step": 29752 }, { "epoch": 0.59508, "grad_norm": 1.9765625, "grad_norm_var": 0.007279459635416667, "learning_rate": 0.0001, "loss": 4.1469, "loss/crossentropy": 2.145687997341156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19573777168989182, "step": 29754 }, { "epoch": 0.59512, "grad_norm": 1.6640625, "grad_norm_var": 0.010823313395182292, "learning_rate": 0.0001, "loss": 3.7671, "loss/crossentropy": 1.956217110157013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17622292786836624, "step": 29756 }, { "epoch": 0.59516, "grad_norm": 1.8125, "grad_norm_var": 0.013716634114583333, "learning_rate": 0.0001, "loss": 3.976, "loss/crossentropy": 2.0639008283615112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18797758221626282, "step": 29758 }, { "epoch": 0.5952, "grad_norm": 1.8984375, "grad_norm_var": 0.013073476155598958, "learning_rate": 0.0001, "loss": 3.9458, "loss/crossentropy": 2.0404372811317444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18267738819122314, "step": 29760 }, { "epoch": 0.59524, "grad_norm": 2.09375, "grad_norm_var": 0.015543619791666666, "learning_rate": 0.0001, "loss": 3.9625, "loss/crossentropy": 2.0870869159698486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18522879481315613, "step": 29762 }, { "epoch": 0.59528, "grad_norm": 2.015625, "grad_norm_var": 0.013602701822916667, "learning_rate": 0.0001, "loss": 4.0244, "loss/crossentropy": 2.0854042172431946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.172662615776062, "step": 29764 }, { "epoch": 0.59532, "grad_norm": 1.8828125, "grad_norm_var": 0.014050038655598958, "learning_rate": 0.0001, "loss": 3.9533, "loss/crossentropy": 2.012739658355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20193946361541748, "step": 29766 }, { "epoch": 0.59536, "grad_norm": 1.8984375, "grad_norm_var": 0.013834635416666666, "learning_rate": 0.0001, "loss": 3.7903, "loss/crossentropy": 1.8651488423347473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19326791167259216, "step": 29768 }, { "epoch": 0.5954, "grad_norm": 2.046875, "grad_norm_var": 0.015258534749348959, "learning_rate": 0.0001, "loss": 3.9921, "loss/crossentropy": 1.8395410776138306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17881877720355988, "step": 29770 }, { "epoch": 0.59544, "grad_norm": 1.8984375, "grad_norm_var": 0.010518137613932292, "learning_rate": 0.0001, "loss": 3.7816, "loss/crossentropy": 1.7089250087738037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19955646991729736, "step": 29772 }, { "epoch": 0.59548, "grad_norm": 1.890625, "grad_norm_var": 0.007122548421223959, "learning_rate": 0.0001, "loss": 4.0978, "loss/crossentropy": 2.233728766441345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20136447995901108, "step": 29774 }, { "epoch": 0.59552, "grad_norm": 1.8125, "grad_norm_var": 0.008324178059895833, "learning_rate": 0.0001, "loss": 4.0012, "loss/crossentropy": 2.104355573654175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19211959093809128, "step": 29776 }, { "epoch": 0.59556, "grad_norm": 1.9609375, "grad_norm_var": 0.0061075846354166664, "learning_rate": 0.0001, "loss": 3.931, "loss/crossentropy": 1.9304287433624268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887596696615219, "step": 29778 }, { "epoch": 0.5956, "grad_norm": 1.890625, "grad_norm_var": 0.004878489176432291, "learning_rate": 0.0001, "loss": 3.9997, "loss/crossentropy": 1.9446918368339539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17917344719171524, "step": 29780 }, { "epoch": 0.59564, "grad_norm": 1.8828125, "grad_norm_var": 0.004196929931640625, "learning_rate": 0.0001, "loss": 4.0013, "loss/crossentropy": 2.2256661653518677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21204601228237152, "step": 29782 }, { "epoch": 0.59568, "grad_norm": 2.109375, "grad_norm_var": 0.0061431884765625, "learning_rate": 0.0001, "loss": 4.0934, "loss/crossentropy": 2.1072696447372437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179524451494217, "step": 29784 }, { "epoch": 0.59572, "grad_norm": 1.8984375, "grad_norm_var": 0.0048411051432291664, "learning_rate": 0.0001, "loss": 4.0207, "loss/crossentropy": 2.2444673776626587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19245705753564835, "step": 29786 }, { "epoch": 0.59576, "grad_norm": 2.09375, "grad_norm_var": 0.006833648681640625, "learning_rate": 0.0001, "loss": 4.2724, "loss/crossentropy": 2.5239516496658325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22635143250226974, "step": 29788 }, { "epoch": 0.5958, "grad_norm": 1.921875, "grad_norm_var": 0.006883748372395833, "learning_rate": 0.0001, "loss": 3.9709, "loss/crossentropy": 1.7270027995109558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18143896013498306, "step": 29790 }, { "epoch": 0.59584, "grad_norm": 1.859375, "grad_norm_var": 0.006170399983723958, "learning_rate": 0.0001, "loss": 3.9258, "loss/crossentropy": 1.9974133372306824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19476915150880814, "step": 29792 }, { "epoch": 0.59588, "grad_norm": 2.015625, "grad_norm_var": 0.006601715087890625, "learning_rate": 0.0001, "loss": 3.999, "loss/crossentropy": 1.9523651003837585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17940453439950943, "step": 29794 }, { "epoch": 0.59592, "grad_norm": 1.84375, "grad_norm_var": 0.0069580078125, "learning_rate": 0.0001, "loss": 3.7075, "loss/crossentropy": 1.8218488097190857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17136383056640625, "step": 29796 }, { "epoch": 0.59596, "grad_norm": 1.9140625, "grad_norm_var": 0.007098134358723958, "learning_rate": 0.0001, "loss": 4.0404, "loss/crossentropy": 2.1148566603660583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21685151010751724, "step": 29798 }, { "epoch": 0.596, "grad_norm": 1.8671875, "grad_norm_var": 0.004833984375, "learning_rate": 0.0001, "loss": 4.0628, "loss/crossentropy": 2.4410229921340942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20404956489801407, "step": 29800 }, { "epoch": 0.59604, "grad_norm": 1.953125, "grad_norm_var": 0.005378214518229166, "learning_rate": 0.0001, "loss": 3.8664, "loss/crossentropy": 1.9751408100128174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18364927917718887, "step": 29802 }, { "epoch": 0.59608, "grad_norm": 1.8671875, "grad_norm_var": 0.0031402587890625, "learning_rate": 0.0001, "loss": 3.861, "loss/crossentropy": 2.2535789012908936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096666395664215, "step": 29804 }, { "epoch": 0.59612, "grad_norm": 1.7734375, "grad_norm_var": 0.004142252604166666, "learning_rate": 0.0001, "loss": 3.817, "loss/crossentropy": 2.2020750641822815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21073434501886368, "step": 29806 }, { "epoch": 0.59616, "grad_norm": 1.8515625, "grad_norm_var": 0.0040934244791666664, "learning_rate": 0.0001, "loss": 4.0591, "loss/crossentropy": 2.0761327147483826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929018497467041, "step": 29808 }, { "epoch": 0.5962, "grad_norm": 2.015625, "grad_norm_var": 0.004064687093098958, "learning_rate": 0.0001, "loss": 4.1519, "loss/crossentropy": 2.014641046524048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975310891866684, "step": 29810 }, { "epoch": 0.59624, "grad_norm": 1.921875, "grad_norm_var": 0.005182902018229167, "learning_rate": 0.0001, "loss": 3.8542, "loss/crossentropy": 1.9420717358589172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1935308501124382, "step": 29812 }, { "epoch": 0.59628, "grad_norm": 1.8671875, "grad_norm_var": 0.005399576822916667, "learning_rate": 0.0001, "loss": 3.829, "loss/crossentropy": 1.7394835352897644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16872025281190872, "step": 29814 }, { "epoch": 0.59632, "grad_norm": 1.9296875, "grad_norm_var": 0.008980305989583333, "learning_rate": 0.0001, "loss": 4.012, "loss/crossentropy": 2.0532928705215454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20437174290418625, "step": 29816 }, { "epoch": 0.59636, "grad_norm": 1.984375, "grad_norm_var": 0.008292388916015626, "learning_rate": 0.0001, "loss": 3.8792, "loss/crossentropy": 1.7072120904922485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16668207198381424, "step": 29818 }, { "epoch": 0.5964, "grad_norm": 1.953125, "grad_norm_var": 0.008063761393229167, "learning_rate": 0.0001, "loss": 4.2173, "loss/crossentropy": 2.362375855445862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20404937118291855, "step": 29820 }, { "epoch": 0.59644, "grad_norm": 1.8359375, "grad_norm_var": 0.006894683837890625, "learning_rate": 0.0001, "loss": 3.9963, "loss/crossentropy": 2.334542155265808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21750033646821976, "step": 29822 }, { "epoch": 0.59648, "grad_norm": 1.9765625, "grad_norm_var": 0.006463368733723958, "learning_rate": 0.0001, "loss": 4.0611, "loss/crossentropy": 2.1138614416122437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2092488631606102, "step": 29824 }, { "epoch": 0.59652, "grad_norm": 1.921875, "grad_norm_var": 0.006184895833333333, "learning_rate": 0.0001, "loss": 3.8785, "loss/crossentropy": 1.7644943594932556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18817196786403656, "step": 29826 }, { "epoch": 0.59656, "grad_norm": 1.9453125, "grad_norm_var": 0.005407460530598958, "learning_rate": 0.0001, "loss": 4.1779, "loss/crossentropy": 2.1820908784866333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20322736352682114, "step": 29828 }, { "epoch": 0.5966, "grad_norm": 1.953125, "grad_norm_var": 0.0041656494140625, "learning_rate": 0.0001, "loss": 3.8968, "loss/crossentropy": 2.146401524543762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19272416085004807, "step": 29830 }, { "epoch": 0.59664, "grad_norm": 1.984375, "grad_norm_var": 0.0023943583170572915, "learning_rate": 0.0001, "loss": 3.9237, "loss/crossentropy": 1.8045800924301147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1904245838522911, "step": 29832 }, { "epoch": 0.59668, "grad_norm": 1.765625, "grad_norm_var": 0.00426025390625, "learning_rate": 0.0001, "loss": 3.9487, "loss/crossentropy": 2.107872247695923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18807457387447357, "step": 29834 }, { "epoch": 0.59672, "grad_norm": 1.7734375, "grad_norm_var": 0.006048329671223958, "learning_rate": 0.0001, "loss": 3.9153, "loss/crossentropy": 2.2815098762512207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112163081765175, "step": 29836 }, { "epoch": 0.59676, "grad_norm": 2.0, "grad_norm_var": 0.006145985921223959, "learning_rate": 0.0001, "loss": 4.1667, "loss/crossentropy": 2.242028594017029, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2006349116563797, "step": 29838 }, { "epoch": 0.5968, "grad_norm": 1.953125, "grad_norm_var": 0.006029256184895833, "learning_rate": 0.0001, "loss": 3.9325, "loss/crossentropy": 2.2297927141189575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074139192700386, "step": 29840 }, { "epoch": 0.59684, "grad_norm": 2.046875, "grad_norm_var": 0.007071940104166666, "learning_rate": 0.0001, "loss": 4.1403, "loss/crossentropy": 2.2018691301345825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21267356723546982, "step": 29842 }, { "epoch": 0.59688, "grad_norm": 1.828125, "grad_norm_var": 0.007819620768229167, "learning_rate": 0.0001, "loss": 3.8575, "loss/crossentropy": 2.0878920555114746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18239793926477432, "step": 29844 }, { "epoch": 0.59692, "grad_norm": 1.9609375, "grad_norm_var": 0.007917277018229167, "learning_rate": 0.0001, "loss": 4.0925, "loss/crossentropy": 2.0679028034210205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20400486141443253, "step": 29846 }, { "epoch": 0.59696, "grad_norm": 1.6328125, "grad_norm_var": 0.010227203369140625, "learning_rate": 0.0001, "loss": 3.781, "loss/crossentropy": 2.0496456623077393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2035546898841858, "step": 29848 }, { "epoch": 0.597, "grad_norm": 1.90625, "grad_norm_var": 0.009325917561848958, "learning_rate": 0.0001, "loss": 4.0158, "loss/crossentropy": 2.0204665660858154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1830250397324562, "step": 29850 }, { "epoch": 0.59704, "grad_norm": 1.796875, "grad_norm_var": 0.009325154622395833, "learning_rate": 0.0001, "loss": 3.6884, "loss/crossentropy": 2.0521149039268494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18589533865451813, "step": 29852 }, { "epoch": 0.59708, "grad_norm": 1.84375, "grad_norm_var": 0.009506988525390624, "learning_rate": 0.0001, "loss": 3.6348, "loss/crossentropy": 1.8169047832489014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17271429300308228, "step": 29854 }, { "epoch": 0.59712, "grad_norm": 1.84375, "grad_norm_var": 0.008514149983723959, "learning_rate": 0.0001, "loss": 4.0186, "loss/crossentropy": 2.029230833053589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18239567428827286, "step": 29856 }, { "epoch": 0.59716, "grad_norm": 1.71875, "grad_norm_var": 0.0059893290201822914, "learning_rate": 0.0001, "loss": 3.97, "loss/crossentropy": 2.0296765565872192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822112277150154, "step": 29858 }, { "epoch": 0.5972, "grad_norm": 1.859375, "grad_norm_var": 0.00657958984375, "learning_rate": 0.0001, "loss": 3.9411, "loss/crossentropy": 1.9933258891105652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1953386887907982, "step": 29860 }, { "epoch": 0.59724, "grad_norm": 2.171875, "grad_norm_var": 0.014229329427083333, "learning_rate": 0.0001, "loss": 3.9819, "loss/crossentropy": 1.9453927278518677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19596701860427856, "step": 29862 }, { "epoch": 0.59728, "grad_norm": 1.890625, "grad_norm_var": 0.011185455322265624, "learning_rate": 0.0001, "loss": 3.8056, "loss/crossentropy": 1.714016318321228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17374874651432037, "step": 29864 }, { "epoch": 0.59732, "grad_norm": 1.8671875, "grad_norm_var": 0.01087646484375, "learning_rate": 0.0001, "loss": 3.6872, "loss/crossentropy": 2.0186676383018494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19331404566764832, "step": 29866 }, { "epoch": 0.59736, "grad_norm": 1.71875, "grad_norm_var": 0.011922200520833334, "learning_rate": 0.0001, "loss": 3.8642, "loss/crossentropy": 1.9068130850791931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18482722342014313, "step": 29868 }, { "epoch": 0.5974, "grad_norm": 1.90625, "grad_norm_var": 0.014092763264973959, "learning_rate": 0.0001, "loss": 4.0994, "loss/crossentropy": 2.192137658596039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19123639166355133, "step": 29870 }, { "epoch": 0.59744, "grad_norm": 1.7734375, "grad_norm_var": 0.014788564046223958, "learning_rate": 0.0001, "loss": 3.5734, "loss/crossentropy": 1.859050452709198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19843916594982147, "step": 29872 }, { "epoch": 0.59748, "grad_norm": 2.28125, "grad_norm_var": 0.021469879150390624, "learning_rate": 0.0001, "loss": 4.1619, "loss/crossentropy": 1.5558834075927734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17953873425722122, "step": 29874 }, { "epoch": 0.59752, "grad_norm": 1.8828125, "grad_norm_var": 0.021274566650390625, "learning_rate": 0.0001, "loss": 4.2012, "loss/crossentropy": 2.1445621252059937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20609040558338165, "step": 29876 }, { "epoch": 0.59756, "grad_norm": 1.8984375, "grad_norm_var": 0.01730931599934896, "learning_rate": 0.0001, "loss": 4.0528, "loss/crossentropy": 1.8768232464790344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16527344286441803, "step": 29878 }, { "epoch": 0.5976, "grad_norm": 1.796875, "grad_norm_var": 0.018024698893229166, "learning_rate": 0.0001, "loss": 3.8604, "loss/crossentropy": 1.830302894115448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17598530650138855, "step": 29880 }, { "epoch": 0.59764, "grad_norm": 1.875, "grad_norm_var": 0.018115234375, "learning_rate": 0.0001, "loss": 3.9474, "loss/crossentropy": 2.0607933402061462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18962115049362183, "step": 29882 }, { "epoch": 0.59768, "grad_norm": 1.984375, "grad_norm_var": 0.015657552083333335, "learning_rate": 0.0001, "loss": 3.8516, "loss/crossentropy": 2.2717671394348145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21351712197065353, "step": 29884 }, { "epoch": 0.59772, "grad_norm": 1.765625, "grad_norm_var": 0.014383951822916666, "learning_rate": 0.0001, "loss": 3.8209, "loss/crossentropy": 2.0096693634986877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1812792867422104, "step": 29886 }, { "epoch": 0.59776, "grad_norm": 1.90625, "grad_norm_var": 0.0130035400390625, "learning_rate": 0.0001, "loss": 3.8955, "loss/crossentropy": 1.9364767670631409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19597072899341583, "step": 29888 }, { "epoch": 0.5978, "grad_norm": 1.9296875, "grad_norm_var": 0.003348541259765625, "learning_rate": 0.0001, "loss": 4.1403, "loss/crossentropy": 2.1214417219161987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1933349445462227, "step": 29890 }, { "epoch": 0.59784, "grad_norm": 1.8984375, "grad_norm_var": 0.007013956705729167, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 1.9339659214019775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958373337984085, "step": 29892 }, { "epoch": 0.59788, "grad_norm": 2.0625, "grad_norm_var": 0.009913889567057292, "learning_rate": 0.0001, "loss": 4.3786, "loss/crossentropy": 1.940473735332489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18802516907453537, "step": 29894 }, { "epoch": 0.59792, "grad_norm": 1.8515625, "grad_norm_var": 0.009230295817057291, "learning_rate": 0.0001, "loss": 3.8581, "loss/crossentropy": 2.048341751098633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18541985005140305, "step": 29896 }, { "epoch": 0.59796, "grad_norm": 1.9296875, "grad_norm_var": 0.008780924479166667, "learning_rate": 0.0001, "loss": 4.1022, "loss/crossentropy": 1.9213348031044006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17794149369001389, "step": 29898 }, { "epoch": 0.598, "grad_norm": 1.8828125, "grad_norm_var": 0.008807118733723958, "learning_rate": 0.0001, "loss": 4.0137, "loss/crossentropy": 2.160265564918518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19693263620138168, "step": 29900 }, { "epoch": 0.59804, "grad_norm": 1.90625, "grad_norm_var": 0.007718658447265625, "learning_rate": 0.0001, "loss": 4.1329, "loss/crossentropy": 2.3574010133743286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20197532325983047, "step": 29902 }, { "epoch": 0.59808, "grad_norm": 1.90625, "grad_norm_var": 0.007505035400390625, "learning_rate": 0.0001, "loss": 4.0925, "loss/crossentropy": 2.2992867827415466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21149636805057526, "step": 29904 }, { "epoch": 0.59812, "grad_norm": 2.015625, "grad_norm_var": 0.007328033447265625, "learning_rate": 0.0001, "loss": 4.0631, "loss/crossentropy": 2.1905906200408936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21494387090206146, "step": 29906 }, { "epoch": 0.59816, "grad_norm": 1.9140625, "grad_norm_var": 0.007808176676432291, "learning_rate": 0.0001, "loss": 3.6926, "loss/crossentropy": 2.0027430057525635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19078665226697922, "step": 29908 }, { "epoch": 0.5982, "grad_norm": 2.046875, "grad_norm_var": 0.007346343994140625, "learning_rate": 0.0001, "loss": 4.0699, "loss/crossentropy": 2.5675058364868164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21340946853160858, "step": 29910 }, { "epoch": 0.59824, "grad_norm": 1.8359375, "grad_norm_var": 0.007746378580729167, "learning_rate": 0.0001, "loss": 4.0047, "loss/crossentropy": 1.6819973587989807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17643968760967255, "step": 29912 }, { "epoch": 0.59828, "grad_norm": 1.953125, "grad_norm_var": 0.008499908447265624, "learning_rate": 0.0001, "loss": 4.0427, "loss/crossentropy": 1.9997480511665344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1763923391699791, "step": 29914 }, { "epoch": 0.59832, "grad_norm": 1.9296875, "grad_norm_var": 0.009580230712890625, "learning_rate": 0.0001, "loss": 3.9184, "loss/crossentropy": 2.02975070476532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19244793057441711, "step": 29916 }, { "epoch": 0.59836, "grad_norm": 1.984375, "grad_norm_var": 0.009405263264973958, "learning_rate": 0.0001, "loss": 3.903, "loss/crossentropy": 1.9599812030792236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20076382905244827, "step": 29918 }, { "epoch": 0.5984, "grad_norm": 2.0625, "grad_norm_var": 0.010119374593098958, "learning_rate": 0.0001, "loss": 4.0101, "loss/crossentropy": 2.106861114501953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18403494358062744, "step": 29920 }, { "epoch": 0.59844, "grad_norm": 1.8125, "grad_norm_var": 0.009830474853515625, "learning_rate": 0.0001, "loss": 3.9163, "loss/crossentropy": 2.0991535782814026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18648552894592285, "step": 29922 }, { "epoch": 0.59848, "grad_norm": 1.859375, "grad_norm_var": 0.0095123291015625, "learning_rate": 0.0001, "loss": 4.0917, "loss/crossentropy": 2.205238461494446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.192887581884861, "step": 29924 }, { "epoch": 0.59852, "grad_norm": 1.8671875, "grad_norm_var": 0.006807200113932292, "learning_rate": 0.0001, "loss": 4.1419, "loss/crossentropy": 1.8452889919281006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17111057043075562, "step": 29926 }, { "epoch": 0.59856, "grad_norm": 1.890625, "grad_norm_var": 0.0066640218098958336, "learning_rate": 0.0001, "loss": 3.8361, "loss/crossentropy": 2.1874170303344727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274078845977783, "step": 29928 }, { "epoch": 0.5986, "grad_norm": 1.8515625, "grad_norm_var": 0.009671783447265625, "learning_rate": 0.0001, "loss": 4.0676, "loss/crossentropy": 2.0940569639205933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19653251022100449, "step": 29930 }, { "epoch": 0.59864, "grad_norm": 1.8515625, "grad_norm_var": 0.009440104166666666, "learning_rate": 0.0001, "loss": 4.0597, "loss/crossentropy": 2.1147854328155518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19174247980117798, "step": 29932 }, { "epoch": 0.59868, "grad_norm": 1.8125, "grad_norm_var": 0.009279123942057292, "learning_rate": 0.0001, "loss": 3.8874, "loss/crossentropy": 1.9983789324760437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19001173973083496, "step": 29934 }, { "epoch": 0.59872, "grad_norm": 1.9453125, "grad_norm_var": 0.007264963785807292, "learning_rate": 0.0001, "loss": 3.8884, "loss/crossentropy": 2.0145105123519897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1948821321129799, "step": 29936 }, { "epoch": 0.59876, "grad_norm": 1.7578125, "grad_norm_var": 0.0087890625, "learning_rate": 0.0001, "loss": 3.9149, "loss/crossentropy": 2.0878870487213135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19328270852565765, "step": 29938 }, { "epoch": 0.5988, "grad_norm": 2.0, "grad_norm_var": 0.008177693684895833, "learning_rate": 0.0001, "loss": 3.8649, "loss/crossentropy": 2.0433109998703003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1775122806429863, "step": 29940 }, { "epoch": 0.59884, "grad_norm": 1.8671875, "grad_norm_var": 0.016060384114583333, "learning_rate": 0.0001, "loss": 4.339, "loss/crossentropy": 2.45761239528656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20690447837114334, "step": 29942 }, { "epoch": 0.59888, "grad_norm": 2.171875, "grad_norm_var": 0.019052886962890626, "learning_rate": 0.0001, "loss": 4.2211, "loss/crossentropy": 2.149196147918701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22804293036460876, "step": 29944 }, { "epoch": 0.59892, "grad_norm": 1.8203125, "grad_norm_var": 0.0178131103515625, "learning_rate": 0.0001, "loss": 3.668, "loss/crossentropy": 2.208665132522583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886969953775406, "step": 29946 }, { "epoch": 0.59896, "grad_norm": 1.859375, "grad_norm_var": 0.017479451497395833, "learning_rate": 0.0001, "loss": 3.9187, "loss/crossentropy": 2.1634607315063477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016885057091713, "step": 29948 }, { "epoch": 0.599, "grad_norm": 1.7890625, "grad_norm_var": 0.018651326497395832, "learning_rate": 0.0001, "loss": 4.0225, "loss/crossentropy": 2.289479374885559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.209372416138649, "step": 29950 }, { "epoch": 0.59904, "grad_norm": 1.8359375, "grad_norm_var": 0.019535064697265625, "learning_rate": 0.0001, "loss": 3.9492, "loss/crossentropy": 2.1859498023986816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2030099779367447, "step": 29952 }, { "epoch": 0.59908, "grad_norm": 1.8515625, "grad_norm_var": 0.017496744791666668, "learning_rate": 0.0001, "loss": 3.8509, "loss/crossentropy": 2.1713619232177734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18735715746879578, "step": 29954 }, { "epoch": 0.59912, "grad_norm": 1.9609375, "grad_norm_var": 0.0174468994140625, "learning_rate": 0.0001, "loss": 4.1315, "loss/crossentropy": 2.0735312700271606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20961667597293854, "step": 29956 }, { "epoch": 0.59916, "grad_norm": 1.828125, "grad_norm_var": 0.009969075520833334, "learning_rate": 0.0001, "loss": 4.0316, "loss/crossentropy": 1.6638267636299133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16829392313957214, "step": 29958 }, { "epoch": 0.5992, "grad_norm": 1.8828125, "grad_norm_var": 0.005635325113932292, "learning_rate": 0.0001, "loss": 4.0428, "loss/crossentropy": 1.924504578113556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1855188086628914, "step": 29960 }, { "epoch": 0.59924, "grad_norm": 2.0625, "grad_norm_var": 0.0067942301432291664, "learning_rate": 0.0001, "loss": 4.0467, "loss/crossentropy": 2.2729889154434204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21448159217834473, "step": 29962 }, { "epoch": 0.59928, "grad_norm": 1.796875, "grad_norm_var": 0.007609049479166667, "learning_rate": 0.0001, "loss": 3.8208, "loss/crossentropy": 1.9595959186553955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21665186434984207, "step": 29964 }, { "epoch": 0.59932, "grad_norm": 2.0625, "grad_norm_var": 0.007002512613932292, "learning_rate": 0.0001, "loss": 3.9934, "loss/crossentropy": 2.208333909511566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20341281592845917, "step": 29966 }, { "epoch": 0.59936, "grad_norm": 1.9296875, "grad_norm_var": 0.00604248046875, "learning_rate": 0.0001, "loss": 3.9484, "loss/crossentropy": 2.0344181060791016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19173699617385864, "step": 29968 }, { "epoch": 0.5994, "grad_norm": 1.7421875, "grad_norm_var": 0.008063761393229167, "learning_rate": 0.0001, "loss": 4.0233, "loss/crossentropy": 1.9575786590576172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16551142930984497, "step": 29970 }, { "epoch": 0.59944, "grad_norm": 1.828125, "grad_norm_var": 0.008174387613932292, "learning_rate": 0.0001, "loss": 3.7948, "loss/crossentropy": 1.7442238330841064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16575023531913757, "step": 29972 }, { "epoch": 0.59948, "grad_norm": 1.8359375, "grad_norm_var": 0.00819091796875, "learning_rate": 0.0001, "loss": 3.9174, "loss/crossentropy": 2.3819799423217773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21309742331504822, "step": 29974 }, { "epoch": 0.59952, "grad_norm": 1.9296875, "grad_norm_var": 0.009165191650390625, "learning_rate": 0.0001, "loss": 3.8633, "loss/crossentropy": 2.097830832004547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17393425852060318, "step": 29976 }, { "epoch": 0.59956, "grad_norm": 1.9375, "grad_norm_var": 0.007382965087890625, "learning_rate": 0.0001, "loss": 3.7568, "loss/crossentropy": 1.771178424358368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18638163059949875, "step": 29978 }, { "epoch": 0.5996, "grad_norm": 1.96875, "grad_norm_var": 0.007289377848307291, "learning_rate": 0.0001, "loss": 4.2791, "loss/crossentropy": 2.419429302215576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20359767973423004, "step": 29980 }, { "epoch": 0.59964, "grad_norm": 1.890625, "grad_norm_var": 0.005881500244140625, "learning_rate": 0.0001, "loss": 3.7571, "loss/crossentropy": 2.0326388478279114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20969726145267487, "step": 29982 }, { "epoch": 0.59968, "grad_norm": 1.9453125, "grad_norm_var": 0.005983225504557292, "learning_rate": 0.0001, "loss": 4.0842, "loss/crossentropy": 1.9649672508239746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18691973388195038, "step": 29984 }, { "epoch": 0.59972, "grad_norm": 2.03125, "grad_norm_var": 0.00643310546875, "learning_rate": 0.0001, "loss": 3.8939, "loss/crossentropy": 1.9001884460449219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18768402934074402, "step": 29986 }, { "epoch": 0.59976, "grad_norm": 1.859375, "grad_norm_var": 0.006208292643229167, "learning_rate": 0.0001, "loss": 3.8919, "loss/crossentropy": 2.0225003361701965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2066223919391632, "step": 29988 }, { "epoch": 0.5998, "grad_norm": 1.796875, "grad_norm_var": 0.006451161702473959, "learning_rate": 0.0001, "loss": 3.9285, "loss/crossentropy": 2.175786852836609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20807426422834396, "step": 29990 }, { "epoch": 0.59984, "grad_norm": 1.71875, "grad_norm_var": 0.007228342692057291, "learning_rate": 0.0001, "loss": 3.7598, "loss/crossentropy": 1.842478632926941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17608553171157837, "step": 29992 }, { "epoch": 0.59988, "grad_norm": 1.796875, "grad_norm_var": 0.007746378580729167, "learning_rate": 0.0001, "loss": 3.8669, "loss/crossentropy": 2.1033515334129333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19302091002464294, "step": 29994 }, { "epoch": 0.59992, "grad_norm": 1.921875, "grad_norm_var": 0.006990559895833333, "learning_rate": 0.0001, "loss": 4.0286, "loss/crossentropy": 2.3058249950408936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1858331635594368, "step": 29996 }, { "epoch": 0.59996, "grad_norm": 1.890625, "grad_norm_var": 0.006200917561848958, "learning_rate": 0.0001, "loss": 4.1295, "loss/crossentropy": 2.3059096336364746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21149063110351562, "step": 29998 }, { "epoch": 0.6, "grad_norm": 1.90625, "grad_norm_var": 0.006029256184895833, "learning_rate": 0.0001, "loss": 3.9016, "loss/crossentropy": 2.0354926586151123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987614408135414, "step": 30000 } ], "logging_steps": 2, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.12442965983232e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }